<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3.dtd">
<article article-type="research-article" dtd-version="1.3" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xml:lang="ru"><front><journal-meta><journal-id journal-id-type="publisher-id">bsuir</journal-id><journal-title-group><journal-title xml:lang="ru">Доклады БГУИР</journal-title><trans-title-group xml:lang="en"><trans-title>Doklady BGUIR</trans-title></trans-title-group></journal-title-group><issn pub-type="ppub">1729-7648</issn><issn pub-type="epub">2708-0382</issn><publisher><publisher-name>БГУИР</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="doi">10.35596/1729-7648-2020-18-2-23-29</article-id><article-id custom-type="elpub" pub-id-type="custom">bsuir-2637</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="ru"><subject>ЭЛЕКТРОНИКА, РАДИОФИЗИКА, РАДИОТЕХНИКА, ИНФОРМАТИКА</subject></subj-group><subj-group subj-group-type="section-heading" xml:lang="en"><subject>ELECTRONICS, RADIOPHYSICS, RADIOENGINEERING, INFORMATICS</subject></subj-group></article-categories><title-group><article-title>AN EFFICIENT SPEECH GENERATIVE MODEL BASED ON DETERMINISTIC/STOCHASTIC SEPARATION OF SPECTRAL ENVELOPES</article-title><trans-title-group xml:lang="en"><trans-title>AN EFFICIENT SPEECH GENERATIVE MODEL BASED ON DETERMINISTIC/STOCHASTIC SEPARATION OF SPECTRAL ENVELOPES</trans-title></trans-title-group></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Taha</surname><given-names>M.</given-names></name><name name-style="western" xml:lang="en"><surname>Taha</surname><given-names>M.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Minsk</p></bio><bio xml:lang="en"><p>Mostafa Taha, Master of Sciences, PhD student of Computer 
Engineering Department</p><p>Minsk</p></bio><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Azarov</surname><given-names>E. S.</given-names></name><name name-style="western" xml:lang="en"><surname>Azarov</surname><given-names>E. S.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Minsk</p></bio><bio xml:lang="en"><p>Elias S. Azarov, D.Sci., Professor of Computer Engineering Department</p><p>Minsk</p></bio><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Likhachov</surname><given-names>D. S.</given-names></name><name name-style="western" xml:lang="en"><surname>Likhachov</surname><given-names>D. S.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Minsk</p></bio><bio xml:lang="en"><p>Likhachov Denis Sergeevich, PhD, Associate Professor of Computer Engineering Department</p><p>220013, Republic of Belarus, Minsk, P. Brovki str., 6; tel. +375172938805</p></bio><email xlink:type="simple">likhachov@bsuir.by</email><xref ref-type="aff" rid="aff-1"/></contrib><contrib contrib-type="author" corresp="yes"><name-alternatives><name name-style="eastern" xml:lang="ru"><surname>Petrovsky</surname><given-names>A. А.</given-names></name><name name-style="western" xml:lang="en"><surname>Petrovsky</surname><given-names>A. A.</given-names></name></name-alternatives><bio xml:lang="ru"><p>Minsk</p></bio><bio xml:lang="en"><p>Alexander A. 
Petrovsky, D.Sci., Professor of Computer Engineering Department</p><p>Minsk</p></bio><xref ref-type="aff" rid="aff-1"/></contrib></contrib-group><aff-alternatives id="aff-1"><aff xml:lang="ru"><institution>Belarusian State University of Informatics and Radioelectronics</institution></aff><aff xml:lang="en"><institution>Belarusian State University of Informatics and Radioelectronics</institution></aff></aff-alternatives><pub-date pub-type="collection"><year>2020</year></pub-date><pub-date pub-type="epub"><day>31</day><month>03</month><year>2020</year></pub-date><volume>18</volume><issue>2</issue><fpage>23</fpage><lpage>29</lpage><permissions><copyright-statement>Copyright © Taha M., Azarov E.S., Likhachov D.S., Petrovsky A.A., 2020</copyright-statement><copyright-year>2020</copyright-year><copyright-holder xml:lang="ru">Taha M., Azarov E.S., Likhachov D.S., Petrovsky A.A.</copyright-holder><copyright-holder xml:lang="en">Taha M., Azarov E.S., Likhachov D.S., Petrovsky A.A.</copyright-holder><license xml:lang="ru" license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>Данная работа распространяется под лицензией Creative Commons Attribution 4.0.</license-p></license><license xml:lang="en" license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/" xlink:type="simple"><license-p>This work is licensed under a Creative Commons Attribution 4.0 License.</license-p></license></permissions><self-uri xlink:href="https://doklady.bsuir.by/jour/article/view/2637">https://doklady.bsuir.by/jour/article/view/2637</self-uri><abstract><p>The paper presents a speech generative model that provides an efficient way of generating speech waveform from its amplitude spectral envelopes. The model is based on hybrid speech representation that includes deterministic (harmonic) and stochastic (noise) components. 
The main idea behind the approach originates from the fact that speech signal has a determined spectral structure that is statistically bound with deterministic/stochastic energy distribution in the spectrum. The performance of the model is evaluated using an experimental low-bitrate wide-band speech coder. The quality of reconstructed speech is evaluated using objective and subjective methods. Two objective quality characteristics were calculated: Modified Bark Spectral Distortion (MBSD) and Perceptual Evaluation of Speech Quality (PESQ). Narrow-band and wide-band versions of the proposed solution were compared with MELP (Mixed Excitation Linear Prediction) speech coder and AMR (Adaptive Multi-Rate) speech coder, respectively. The speech base of two female and two male speakers was used for testing. The performed tests show that overall performance of the proposed approach is speaker-dependent and it is better for male voices. Supposedly, this difference indicates the influence of pitch highness on separation accuracy. In that way, using the proposed approach in experimental speech compression system provides decent MBSD values and comparable PESQ values with AMR speech coder at 6.6 kbit/s. Additional subjective listening tests demonstrate that the implemented coding system retains phonetic content and speaker’s identity. It proves consistency of the proposed approach.</p></abstract><trans-abstract xml:lang="en"><p>The paper presents a speech generative model that provides an efficient way of generating speech waveform from its amplitude spectral envelopes. The model is based on hybrid speech representation that includes deterministic (harmonic) and stochastic (noise) components. The main idea behind the approach originates from the fact that speech signal has a determined spectral structure that is statistically bound with deterministic/stochastic energy distribution in the spectrum. 
The performance of the model is evaluated using an experimental low-bitrate wide-band speech coder. The quality of reconstructed speech is evaluated using objective and subjective methods. Two objective quality characteristics were calculated: Modified Bark Spectral Distortion (MBSD) and Perceptual Evaluation of Speech Quality (PESQ). Narrow-band and wide-band versions of the proposed solution were compared with MELP (Mixed Excitation Linear Prediction) speech coder and AMR (Adaptive Multi-Rate) speech coder, respectively. The speech base of two female and two male speakers was used for testing. The performed tests show that overall performance of the proposed approach is speaker-dependent and it is better for male voices. Supposedly, this difference indicates the influence of pitch highness on separation accuracy. In that way, using the proposed approach in experimental speech compression system provides decent MBSD values and comparable PESQ values with AMR speech coder at 6.6 kbit/s. Additional subjective listening tests demonstrate that the implemented coding system retains phonetic content and speaker’s identity. It proves consistency of the proposed approach.</p></trans-abstract><kwd-group xml:lang="ru"><kwd>speech generative model</kwd><kwd>harmonic plus noise model</kwd><kwd>speech analysis</kwd><kwd>speech coding</kwd></kwd-group><kwd-group xml:lang="en"><kwd>speech generative model</kwd><kwd>harmonic plus noise model</kwd><kwd>speech analysis</kwd><kwd>speech coding</kwd></kwd-group></article-meta></front><back><ref-list><title>References</title><ref id="cit1"><label>1</label><citation-alternatives><mixed-citation xml:lang="ru">A. van den Oord, Dieleman S., Zen H., Simonyan K., Vinyals O., Graves A., Kalchbrenner N., Senior A., Kavukcuoglu K. WaveNet: A generative model for raw audio, arXiv:1609.03499, 2016.</mixed-citation><mixed-citation xml:lang="en">A. 
van den Oord, Dieleman S., Zen H., Simonyan K., Vinyals O., Graves A., Kalchbrenner N., Senior A., Kavukcuoglu K. WaveNet: A generative model for raw audio, arXiv:1609.03499, 2016.</mixed-citation></citation-alternatives></ref><ref id="cit2"><label>2</label><citation-alternatives><mixed-citation xml:lang="ru">Shen J., Pang R., Weiss R. J., Schuster M., Jaitly N., Yang Z., Chen Z., Zhang Y., Wang Y., Skerry-Ryan R. “Natural TTS synthesis by conditioning WaveNet on mel spectrogram predictions,” in Proc. International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2018:4779-4783.</mixed-citation><mixed-citation xml:lang="en">Shen J., Pang R., Weiss R. J., Schuster M., Jaitly N., Yang Z., Chen Z., Zhang Y., Wang Y., Skerry-Ryan R. “Natural TTS synthesis by conditioning WaveNet on mel spectrogram predictions,” in Proc. International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 2018:4779-4783.</mixed-citation></citation-alternatives></ref><ref id="cit3"><label>3</label><citation-alternatives><mixed-citation xml:lang="ru">Arik S., Diamos G., Gibiansky A., Miller J., Peng K., Ping W., Raiman J., and Zhou Y. Deep voice 2: Multi-speaker neural text-to-speech. arXiv:1705.08947, 2017.</mixed-citation><mixed-citation xml:lang="en">Arik S., Diamos G., Gibiansky A., Miller J., Peng K., Ping W., Raiman J., and Zhou Y. Deep voice 2: Multi-speaker neural text-to-speech. arXiv:1705.08947, 2017.</mixed-citation></citation-alternatives></ref><ref id="cit4"><label>4</label><citation-alternatives><mixed-citation xml:lang="ru">Valin J.-M., Skoglund J. LPCNet: Improving neural speech synthesis through linear prediction, arXiv:1810.11846</mixed-citation><mixed-citation xml:lang="en">Valin J.-M., Skoglund J. LPCNet: Improving neural speech synthesis through linear prediction, arXiv:1810.11846</mixed-citation></citation-alternatives></ref><ref id="cit5"><label>5</label><citation-alternatives><mixed-citation xml:lang="ru">Griffin D., Lim J. 
A new model-based speech analysis/synthesis system. In Proc. International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 1985;10:513-516.</mixed-citation><mixed-citation xml:lang="en">Griffin D., Lim J. A new model-based speech analysis/synthesis system. In Proc. International Conference on Acoustics, Speech, and Signal Processing (ICASSP), 1985;10:513-516.</mixed-citation></citation-alternatives></ref><ref id="cit6"><label>6</label><citation-alternatives><mixed-citation xml:lang="ru">Laroche J., Stylianou Y., Moulines E. HNS: Speech modification based on a harmonic+noise model. Proceedings of the ICASSP-93 IEEE International Conference on Acoustics, Speech, and Signal Processing, 1993;2:550-553.</mixed-citation><mixed-citation xml:lang="en">Laroche J., Stylianou Y., Moulines E. HNS: Speech modification based on a harmonic+noise model. Proceedings of the ICASSP-93 IEEE International Conference on Acoustics, Speech, and Signal Processing, 1993;2:550-553.</mixed-citation></citation-alternatives></ref><ref id="cit7"><label>7</label><citation-alternatives><mixed-citation xml:lang="ru">Serra X. Musical sound modeling with sinusoids plus noise. Musical Signal Processing (C. Roads, S. Pope, A. Picialli, G. De Poli Eds.). Swets &amp; Zeitlinger Publishers; 1997.</mixed-citation><mixed-citation xml:lang="en">Serra X. Musical sound modeling with sinusoids plus noise. Musical Signal Processing (C. Roads, S. Pope, A. Picialli, G. De Poli Eds.). Swets &amp; Zeitlinger Publishers; 1997.</mixed-citation></citation-alternatives></ref><ref id="cit8"><label>8</label><citation-alternatives><mixed-citation xml:lang="ru">Azarov E., Petrovsky A. Instantaneous harmonic analysis for vocal processing. Proceedings of DAFx-09. Como, Italy, September 14; 2009.</mixed-citation><mixed-citation xml:lang="en">Azarov E., Petrovsky A. Instantaneous harmonic analysis for vocal processing. Proceedings of DAFx-09. 
Como, Italy, September 14; 2009.</mixed-citation></citation-alternatives></ref></ref-list><fn-group><fn fn-type="conflict"><p>The authors declare that there are no conflicts of interest present.</p></fn></fn-group></back></article>
