% Conference paper in the ICONIP 2024 proceedings (Part V), Lecture Notes in Computer Science series.
% Page range and series volume number are not recorded here; add `pages`/`volume` once confirmed.
@inproceedings{4802cebb8c0e4e31b53c09dc06543a4e,
  author    = {He, Yuting and Li, Chengtai and Yu, Heng and Ren, Jianfeng and Wang, Zheng and Du, Heshan and Xia, Yinshui},
  title     = {Dual-branch {StarNet} with Mutual Attention and {U-Net} Denoising for Simultaneously Recognizing Keywords and Speakers},
  editor    = {Mahmud, Mufti and Doborjeh, Maryam and Doborjeh, Zohreh and Wong, Kevin and Leung, Andrew Chi Sing and Tanveer, M.},
  booktitle = {Neural Information Processing: 31st International Conference, {ICONIP} 2024, Auckland, New Zealand, December 2--6, 2024, Proceedings, Part V},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  address   = {Singapore},
  year      = {2025},
  month     = jun,
  day       = {24},
  doi       = {10.1007/978-981-96-6588-4_20},
  isbn      = {9789819665877},
  language  = {English},
  keywords  = {Keyword Spotting, Speaker Verification, StarNet, Mutual Attention, Multi-task Learning},
  abstract  = {Deep neural networks have demonstrated remarkable capabilities in speech-recognition tasks such as keyword spotting and speaker verification, but these applications face significant challenges in real-world noisy environments. To tackle the challenges, we propose an end-to-end network, Dual-Branch StarNet with Mutual Attention and U-Net Denoising (DSN-MAUD), to perform denoising, keyword spotting, and speaker recognition simultaneously. The proposed model consists of three blocks. Firstly, after obtaining the spectrogram from audio signals, a U-Net encoder-decoder block is designed for noise removal by mapping noisy spectrograms to clean spectrograms. Then, a Dual-Branch StarNet block is designed, where a shared backbone is utilized to extract common features for both keyword spotting and speaker verification, followed by two dedicated network branches for keyword spotting and speaker verification to extract their discriminant features, respectively. Finally, a Mutual Attention block is designed to exploit the attentive features within these two sets of features to enhance their discriminative power. To evaluate the proposed method in real-world scenarios, we inject various types of noise at different noise levels into the Google Speech Command dataset and perform a comprehensive set of experiments on the generated dataset. Experiment results show that the proposed method outperforms state-of-the-art methods on both keyword spotting and speaker verification tasks under all evaluation scenarios.},
}