https://hal.archives-ouvertes.fr/hal-02963528
Tip revision: 282551cd4868b7b38f2c72e9b0ac84a22e7b8411 authored by Software Heritage on 01 January 2017, 00:00:00 UTC
hal: Deposit 1043 in collection hal
hal: Deposit 1043 in collection hal
Tip revision: 282551c
main_compute_speaker_models.m
function main_compute_speaker_models
% main_compute_speaker_models
%
% Main function to compute clean speaker models with the paper's parameters.
% For every combination of time-frequency transform and speaker learning
% mode listed below, computeSpkModels_corpora is called once with the full
% list of speaker ids.
%
% Before you start to use this code :
% - Download and install FASST v2 toolbox
% (https://gitlab.inria.fr/bass-db/fasst)
% - Download and extract voiceHome-2 corpus (https://doi.org/10.5281/zenodo.1252143)
% - Update paths in the "USER PARAMS" section
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Copyright 2017 Ewen Camberlein and Romain Lebarbenchon
% This software is distributed under the terms of the GNU Public License
% version 3 (http://www.gnu.org/licenses/gpl.txt)
% If you find it useful, please cite the following reference:
% - Nancy Bertin, Ewen Camberlein, Romain Lebarbenchon, Emmanuel Vincent,
% Sunit Sivasankaran, Irina Illina, Frédéric Bimbot
% "VoiceHome-2, an extended corpus for multichannel speech processing in
% real homes", submitted to Speech Communication, Elsevier, 2017
%
% Contact : nancy.bertin[at]irisa.fr
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% USER PARAMS
fasst_matlab_dir = 'C:/Program Files/fasst 2.1.0/scripts/MATLAB/'; % Path to FASST Matlab scripts on your computer
corpusPath = 'C:\VBox_Partage\interspeech2\voiceHome-2_corpus_v1.0\'; % Corpus path (must end with a file separator)
%% Add FASST directory to PATH
addpath(fasst_matlab_dir);
addpath('./FASST_Framework/'); % FASST framework functions
% Create the output directory only when it is missing, so that re-running
% the script does not raise a "Directory already exists" warning.
if ~exist('./Models_Spk/', 'dir')
    mkdir('./Models_Spk/');
end
%% PAPER'S PARAMS
% Numeric parameters are given as strings: computeSpkModels_corpora converts
% them to numbers itself.
transformType = {'ERB','STFT'}; % STFT / ERB
spk_learning_mode = {'spk_nodep','spk_dep'}; % spk_dep/spk_nodep
spkIdList = {'F1','F2','F3','F4','M1','M2','M3','M4','M5','M6','M7','M8'}'; % speakers id list
wlen = '1024'; % window length (frame length in time domain) - should be multiple of 4 for STFT and multiple of 2 for ERB
nbin_ERB = '8'; % number of frequency coefficients for ERB transform type
speech_model_type = 'close_field';
spk_NMF_order = '32';
spk_iters = '50';
%% Learn speaker's models
% One run per (transform type, learning mode) pair.
for i = 1:length(transformType)
    for j = 1:length(spk_learning_mode)
        computeSpkModels_corpora(corpusPath,speech_model_type,spk_learning_mode{j},spk_iters,spk_NMF_order,transformType{i},wlen,nbin_ERB,spkIdList);
    end
end
end
function [] = computeSpkModels_corpora(corpusPath,speech_model_type,spk_learning_mode,spk_iters,spk_NMF_order,transformType,wlen,nbin_ERB,spkIdList)
% computeSpkModels_corpora
%
% This function computes clean speaker models for each element of
% spkIdList. For each speaker, the learning wav files are selected according
% to spk_learning_mode, loaded with wavOpening and handed to
% computeSpkModels.
%
% The output directory passed to computeSpkModels is ./Models_Spk/
%
% [] = computeSpkModels_corpora(corpusPath,speech_model_type,spk_learning_mode,spk_iters,spk_NMF_order,transformType,wlen,nbin_ERB,spkIdList)
%
% INPUTS:
%
% corpusPath : String, corpus root directory. The learning files are
% searched in [corpusPath 'audio/clean/']; a trailing
% file separator is appended when missing
% speech_model_type : String, 'close_field'
% spk_learning_mode : String, 'spk_dep', 'spk_nodep'
% spk_dep : Use only utterances of the speaker
% spk_nodep : Use all utterances except for the current speaker
% spk_iters : String (or numeric scalar), number of EM iterations
% spk_NMF_order : String (or numeric scalar), NMF order for the spectral model init
% transformType : String, Time-Frequency transform
% Choices : 'ERB' or 'STFT'
% wlen : String (or numeric scalar), time segment length (in samples)
% nbin_ERB : String (or numeric scalar), number of bins for the ERB Transform
% spkIdList : Cell of strings (row or column) or single string, index of speakers
%
% ERRORS:
%
% An error is raised when speech_model_type or spk_learning_mode is unknown,
% or when a numeric parameter cannot be converted to a numeric scalar.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Copyright 2017 Ewen Camberlein and Romain Lebarbenchon
% This software is distributed under the terms of the GNU Public License
% version 3 (http://www.gnu.org/licenses/gpl.txt)
% If you find it useful, please cite the following reference:
% - Nancy Bertin, Ewen Camberlein, Romain Lebarbenchon, Emmanuel Vincent,
% Sunit Sivasankaran, Irina Illina, Frédéric Bimbot
% "VoiceHome-2, an extended corpus for multichannel speech processing in
% real homes", submitted to Speech Communication, Elsevier, 2017
%
% Contact : nancy.bertin[at]irisa.fr
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Get function name (error management)
% mfilename without argument already returns the file name without path
% nor extension, so no manual parsing of path separators is needed.
functionName = mfilename;
%% INPUT NORMALISATION
% corpusPath is concatenated with sub-directories below: make sure it ends
% with a file separator.
if ~isempty(corpusPath) && ~any(corpusPath(end) == '/\')
    corpusPath = [corpusPath '/'];
end
% Accept a single speaker id as well as a row or column cell of ids.
if ischar(spkIdList)
    spkIdList = {spkIdList};
end
spkIdList = spkIdList(:);
%% USER PARAMETERS
wlen = parseNumericParam(wlen, 'wlen', functionName);
nbin_ERB = parseNumericParam(nbin_ERB, 'nbin_ERB', functionName);
spk_NMF_order = parseNumericParam(spk_NMF_order, 'spk_NMF_order', functionName); % Number of VQ states (32 spk)
spk_iters = parseNumericParam(spk_iters, 'spk_iters', functionName); % Number of EM iterations
sUserParams = ...
    struct(...
    ... % static
    'wlen', wlen, ...
    'nbin_ERB', nbin_ERB, ...
    'spk_NMF_order', spk_NMF_order, ... % NMF order
    'spk_iters', spk_iters, ...
    'transformType', transformType, ...
    'speech_model_type', speech_model_type, ...
    'spk_learning_mode', spk_learning_mode, ...
    ... % dynamic
    'spkId', [] ... % filled in for each speaker in the loop below
    );
%% CAMPAIGN PARAMETERS
sCampaignParams = ...
    struct(...
    ... % static
    'iFs', 16000, ...
    'nspk', 1, ... % Number of speakers in each file
    'spat_meth_default', 'Direction', ... 'Direction' / 'Position'
    'use_diffuse_model', 0, ...
    'back_multiple_sources', 0, ...
    ... % dynamic
    'iNbChannels', 0, ... % init value
    'pfRoomDimensions', [], ...
    'fRT60', [], ...
    'sound_velocity', 343 ...
    );
% Number of channels and learning data directory both depend on the speech
% model type.
switch speech_model_type
    case 'close_field'
        sCampaignParams.iNbChannels = 1;
        learning_data_dir = [corpusPath 'audio/clean/']; % Close-field condition data
    otherwise
        error(['[Error:' functionName '.m]' 'speech model type unknown:' speech_model_type]);
end
% Validate the learning mode before any directory is created, so that a
% typo fails immediately with an explicit message.
switch spk_learning_mode
    case 'spk_dep'
        useOnlyCurrentSpk = true;  % Use only utterances of speaker spkId
    case 'spk_nodep'
        useOnlyCurrentSpk = false; % Use utterances of all speakers except speaker spkId
    otherwise
        error('[Error:%s.m] speaker learning mode unknown: %s', functionName, spk_learning_mode);
end
%% PATHS management
rootProjectDir='./';
sPaths = ...
    struct(...
    'spk_models_dir', [rootProjectDir 'Models_Spk/'], ... % output directory
    'learning_data_dir', learning_data_dir, ...
    'tmp_dir', '' ... % init value
    );
% Manage tmp directories : create a new tmp directory at each function call.
% Existing directories are searched in the same place where the new one is
% created, and only names made of the prefix followed by digits are taken
% into account.
tmpPrefix = ['tmp_' speech_model_type];
tempDirInfos = dir([rootProjectDir tmpPrefix '*']);
existingId = [];
for t = 1:length(tempDirInfos)
    idSuffix = tempDirInfos(t).name(length(tmpPrefix)+1:end);
    if tempDirInfos(t).isdir && ~isempty(regexp(idSuffix, '^\d+$', 'once'))
        existingId(end+1) = str2double(idSuffix); %#ok<AGROW>
    end
end
if isempty(existingId)
    idTempDir = 1;
else
    idTempDir = max(existingId)+1;
end
sPaths.tmp_dir = [rootProjectDir tmpPrefix num2str(idTempDir) '/']; % Temporary directory
if ~exist(sPaths.tmp_dir, 'dir')
    mkdir(sPaths.tmp_dir);
end
% Remove the tmp directory when this function exits, whether it returns
% normally or an error is thrown while models are computed.
tmpDirToRemove = sPaths.tmp_dir;
tmpDirCleanup = onCleanup(@() rmdir(tmpDirToRemove,'s')); %#ok<NASGU>
%% PROCESS
% List wav files in learning dir (same list for all speakers)
Infos_wavLearningDir = dir([sPaths.learning_data_dir '*.wav']);
list_wavFilesLearningDir = {Infos_wavLearningDir.name};
%% LOOP on spkIdList
for s = 1:numel(spkIdList)
    spkId = spkIdList{s};
    fprintf('Speaker %s model generation\n',spkId);
    % Files whose name contains 'speaker<spkId>' belong to the current speaker
    isFromCurrentSpk = ~cellfun(@isempty,regexp(list_wavFilesLearningDir,['speaker' spkId]));
    if useOnlyCurrentSpk
        b = isFromCurrentSpk;
    else
        b = ~isFromCurrentSpk;
    end
    list_wavFilesForLearning = list_wavFilesLearningDir(b);
    nFiles = length(list_wavFilesForLearning);
    if nFiles == 0
        warning('[Warning:%s.m] no learning wav file selected for speaker %s in %s', functionName, spkId, sPaths.learning_data_dir);
    end
    % compute model from selected list of wav files
    sUserParams.spkId = spkId;
    audio_segments = cell(1,nFiles);
    for n=1:nFiles
        fname_clean = list_wavFilesForLearning{n};
        fname_clean = fname_clean(1:end-4); % strip the '.wav' extension
        audio_segments{n} = wavOpening(corpusPath,fname_clean,'clean','wuw_cmd',[0 0],NaN,sCampaignParams.iFs);
    end
    computeSpkModels(sUserParams, sCampaignParams, sPaths, audio_segments);
end
end

function value = parseNumericParam(param, paramName, functionName)
% parseNumericParam
%
% Convert a parameter given either as a number or as a string holding a
% number into a numeric scalar, and raise an explicit error otherwise.
%
% INPUTS:
%
% param : String or numeric scalar, value to convert
% paramName : String, parameter name (used in the error message)
% functionName : String, caller file name (used in the error message)
%
% OUTPUT:
%
% value : Numeric scalar
if isnumeric(param)
    value = param;
else
    value = str2double(param);
    if ischar(param) && isnan(value)
        % Backward compatibility: strings that are expressions rather than
        % plain numbers (e.g. '2*512') were previously accepted through eval.
        value = str2num(param); %#ok<ST2NM>
    end
end
if ~isnumeric(value) || ~isscalar(value) || isnan(value)
    error('[Error:%s.m] parameter %s must be a numeric scalar or a string holding one', functionName, paramName);
end
end