#!python3
"""
Wrapper for the dictionary learning algorithm. When invoked, the audio
sources in the supplied audio file are separated.
"""
from __future__ import absolute_import, division, print_function
import numpy as np
import sys
import os.path
import pickle
import matplotlib
matplotlib.use('Agg')
#import matplotlib.pyplot as plt
import matplotlib.cm as cm
from ..audio import spect
from ..audio import wav
from ..audio import performance
from . import dictlearn
def correct_signal_length(signal, length):
"""
Right-pad or right-crop the signal such that it fits the desired length.
Arguments
---------
signal : ndarray
Signal to be adjusted
length : int
Desired length of the signal
Returns
-------
ndarray
Adjusted signal
"""
if signal.size > length:
return signal[:length]
elif signal.size < length:
return np.concatenate([signal, np.zeros(length - signal.size)])
else:
return signal
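# Minimal usage sketch (illustrative only): right-padding with zeros vs. right-cropping.
#   correct_signal_length(np.array([1., 2., 3.]), 5)  ->  array([1., 2., 3., 0., 0.])
#   correct_signal_length(np.array([1., 2., 3.]), 2)  ->  array([1., 2.])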
def main(mixed_soundfile, orig_soundfiles, out_name, out_name_run_suffix="",
inst_num=2, tone_num=1, pexp=1, qexp=0.5, har=25, sigmas=6,
sampdist=256, spectheight=6*1024, logspectheight=1024, minfreq=20,
maxfreq=20480, runs=10000, lifetime=500, num_dicts=10, mask=True,
color=False, plot_range=None, spect_method="pursuit",
supply_dicts=None, spect_plots=()):
"""
Wrapper function for the dictionary learning algorithm.
Parameters
----------
mixed_soundfile : string
Name of the mixed input file
orig_soundfiles : list of string or NoneType
Names of the files with the isolated instrument tracks or None
out_name : string
Prefix for the file names
out_name_run_suffix : string
Extra label for the output files
inst_num : int
Number of instruments
tone_num : int
Maximum number of simultaneous tones for each instrument
pexp : float
Exponent for the addition of sinusoids
qexp : float
Exponent to be applied on the spectrum
har : int
Number of harmonics
sigmas : float
Number of standard deviations after which to cut the window/kernel
sampdist : int
Time interval (in samples) at which the spectrogram is sampled
spectheight : int
Height of the linear-frequency spectrogram
logspectheight : int
Height of the log-frequency spectrogram
minfreq : float
Minimum frequency in Hz to be represented (included)
maxfreq : float
Maximum frequency in Hz to be represented (excluded)
runs : int
Number of training iterations to perform
lifetime : int
Number of steps after which to renew the dictionary
num_dicts : int
Number of different dictionaries to generate and train
mask : bool
Whether to apply spectral masking
color : bool or string
Whether color should be used, or specification of the color scheme
plot_range : slice or NoneType
Part of the spectrogram to plot
spect_method : string
If set to `"mel"`, a mel spectrogram is used for separation.
Otherwise, the log-frequency spectrogram is generated via
sparse pursuit.
supply_dicts : NoneType or list of array_like
If specified, use the given dictionaries rather than computing
new ones
spect_plots : sequence of int
Time frames for which to output the spectrum as a text file
Returns
-------
inst_dicts : list of ndarray
Dictionaries that were used for the separation
"""
signal, samprate = wav.read(mixed_soundfile)
plotlen = signal.size
freqrange = np.linspace(0, samprate/2000, spectheight, endpoint=False)
orig_spectrum = spect.spectrogram(
signal, spectheight, sigmas, sampdist)[:spectheight, :]
if plot_range is not None:
spect.spectwrite('output/{}-orig.png'.format(out_name),
orig_spectrum[:spectheight, plot_range],
color)
for sp in spect_plots:
np.savetxt('output/{}-orig-{}.dat'.format(out_name, sp),
np.stack([freqrange, orig_spectrum[:spectheight, sp]], axis=1))
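# If reference tracks are supplied, load them (padded/cropped to the mixture
# length) and compute their spectrograms.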
if orig_soundfiles is None:
orig_signals = None
else:
orig_signals = np.asarray(
[correct_signal_length(wav.read(f)[0], signal.size)
for f in orig_soundfiles])
orig_spectrums = [spect.spectrogram(
os, spectheight, sigmas, sampdist)[:spectheight, :]
for os in orig_signals]
fsigma = sigmas/np.pi
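# Reuse cached spectrogram data from a previous run if available; otherwise
# compute the log-frequency spectrogram with the selected method (mel filter
# bank or sparse pursuit) and cache the results.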
if (os.path.exists('output/{}-lin.npy'.format(out_name))
and os.path.exists('output/{}-log.npy'.format(out_name))
and os.path.exists('output/{}-stretch.npy'.format(out_name))):
linspect = np.load('output/{}-lin.npy'.format(out_name))
logspect = np.load('output/{}-log.npy'.format(out_name))
stretch = np.load('output/{}-stretch.npy'.format(out_name))
elif spect_method == "mel":
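# The stretch factor is the ratio of log-frequency to linear-frequency bin
# density at f = minfreq: the log axis has logspectheight / log(maxfreq/minfreq)
# bins per unit of log-frequency, the linear axis 2*spectheight/samprate bins per Hz.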
stretch = (logspectheight / np.log(maxfreq/minfreq)
/ (minfreq / samprate * 2 * spectheight))
print("stretch: {}".format(stretch))
logspect, linspect = spect.logspect_mel(signal, spectheight,
sigmas, sampdist,
minfreq/samprate,
minfreq/samprate,
maxfreq/samprate,
logspectheight)
logspect = np.sqrt(logspect)
linspect = np.sqrt(linspect)
np.save('output/{}-lin.npy'.format(out_name), linspect)
np.save('output/{}-log.npy'.format(out_name), logspect)
np.save('output/{}-stretch.npy'.format(out_name), stretch)
else:
logspect, linspect = spect.logspect_pursuit(signal, spectheight,
sigmas, sampdist, None,
minfreq/samprate,
maxfreq/samprate,
logspectheight, fsigma)
stretch = 1
np.save('output/{}-lin.npy'.format(out_name), linspect)
np.save('output/{}-log.npy'.format(out_name), logspect)
np.save('output/{}-stretch.npy'.format(out_name), stretch)
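# Optionally write the linear and log-frequency spectrograms as images and
# dump selected time frames as text files.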
if plot_range is not None:
spect.spectwrite('output/{}-lin.png'.format(out_name),
linspect[:, plot_range], color)
spect.spectwrite('output/{}-log.png'.format(out_name),
logspect[:, plot_range], color)
for sp in spect_plots:
np.savetxt('output/{}-lin-{}.dat'.format(out_name, sp),
np.stack([freqrange, linspect[:, sp]], axis=1))
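# Train (or load) num_dicts dictionaries, each with a different random seed,
# separate the mixture with each of them, and collect the performance measures.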
audio_measures = []
inst_dicts = []
for r in range(0, num_dicts):
print("seed: {}".format(r))
out_name_run = out_name + out_name_run_suffix + '-{}'.format(r)
np.random.seed(r)
if supply_dicts is not None:
inst_dict = np.asarray(supply_dicts[r])
elif os.path.exists('output/{}-dict.npy'.format(out_name_run)):
inst_dict = np.load('output/{}-dict.npy'.format(out_name_run))
else:
inst_dict = dictlearn.learn_spect_dict(
logspect, fsigma*stretch, tone_num, inst_num * 2, pexp, qexp,
har, minfreq, maxfreq, runs, lifetime)
np.save('output/{}-dict.npy'.format(out_name_run), inst_dict)
print(inst_dict)
inst_dicts.append(inst_dict)
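# Reuse cached separation results if present; otherwise compute the overall and
# per-instrument spectrograms from the learned dictionary.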
if os.path.exists('output/{}-spect.pkl'.format(out_name_run)):
[dict_spectrum, inst_spectrums,
dict_spectrum_lin, inst_spectrums_lin] = \
pickle.load(open('output/{}-spect.pkl'.format(out_name_run),
'rb'))
else:
(dict_spectrum, inst_spectrums,
dict_spectrum_lin, inst_spectrums_lin) = \
dictlearn.synth_spect(
logspect, tone_num, inst_dict, fsigma*stretch,
spectheight, pexp, qexp,
minfreq/samprate, maxfreq/samprate, stretch)
pickle.dump([dict_spectrum, inst_spectrums,
dict_spectrum_lin, inst_spectrums_lin],
open('output/{}-spect.pkl'.format(out_name_run), 'wb'))
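# Optionally mask the separated linear-frequency spectrograms with the
# spectrogram of the original mixture before resynthesis.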
if mask:
inst_spectrums_lin, mask_spect = dictlearn.mask_spectrums(
inst_spectrums_lin, orig_spectrum)
dict_spectrum_lin = dict_spectrum_lin * mask_spect
mask_str = "mask"
else:
mask_str = "nomask"
if plot_range is not None:
spect.spectwrite('output/{}-synth.png'
.format(out_name_run),
dict_spectrum[:, plot_range], color)
spect.spectwrite('output/{}-synth-lin-{}.png'
.format(out_name_run, mask_str),
dict_spectrum_lin[:, plot_range], color)
for sp in spect_plots:
np.savetxt('output/{}-synth-lin-{}-{}.dat'.format(out_name_run, mask_str, sp),
np.stack([freqrange, dict_spectrum_lin[:, sp]], axis=1))
for i in range(len(inst_spectrums)):
spect.spectwrite(
'output/{}-synth{}.png'
.format(out_name_run, i),
inst_spectrums[i][:, plot_range], color)
spect.spectwrite(
'output/{}-synth{}-lin-{}.png'
.format(out_name_run, i, mask_str),
inst_spectrums_lin[i][:, plot_range], color)
for sp in spect_plots:
np.savetxt('output/{}-synth{}-lin-{}-{}.dat'.format(out_name_run, i, mask_str, sp),
np.stack([freqrange, inst_spectrums_lin[i][:, sp]], axis=1))
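# Resynthesize time-domain audio from the overall and the per-instrument
# linear-frequency spectrograms and write the results as WAV files.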
siglen = signal.size
synth_signals = np.zeros((inst_num, siglen))
audio, _ = spect.synth_audio(dict_spectrum_lin, siglen,
sigmas, sampdist, 1, signal)
wav.write('output/{}-synth-{}.wav'.format(out_name_run, mask_str),
audio, samprate)
for i in range(len(inst_spectrums_lin)):
audio, _ = spect.synth_audio(inst_spectrums_lin[i],
siglen, sigmas, sampdist, 1,
signal)
synth_signals[i, :] = audio
wav.write('output/{}-synth{}-{}.wav'
.format(out_name_run, i, mask_str),
audio, samprate)
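# If reference tracks are available, match the separated tracks to the
# references over all permutations and record the resulting performance measures.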
if orig_signals is not None:
perm, perf = performance.select_perm(*performance.measures(
synth_signals, orig_signals))
audio_measures.append(perf)
print("Permutation:")
print(perm)
print("Performance:")
print(perf)
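# Summarize the performance over all dictionaries (mean, standard deviation,
# best-performing run) and write the raw measures to a text file.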
if orig_signals is not None:
audio_measures = np.asarray(audio_measures)
print("Global measures mean:")
print(np.mean(audio_measures, axis=0))
print("Global measures stdev:")
print(np.std(audio_measures, axis=0, ddof=1))
bestidx = np.argmax(np.sum(audio_measures, axis=2)[:, 0])
print("Global measures best index: {}".format(bestidx))
print("Global measures best:")
print(audio_measures[bestidx, :, :])
np.savetxt('output/{}{}-{}-measures.dat'
.format(out_name, out_name_run_suffix, mask_str),
np.reshape(audio_measures, [num_dicts, 3 * inst_num]))
return inst_dicts
def separate_mozart_recorder_violin():
"Separation of recorder and violin on the piece by Mozart"
main(mixed_soundfile='input/mozart/mix.wav',
orig_soundfiles=['input/mozart/recorder.wav',
'input/mozart/violin.wav'],
out_name='mozart/mozart',
runs=100000,
mask=True,
plot_range=slice(0, 1580))
main(mixed_soundfile='input/mozart/mix.wav',
orig_soundfiles=['input/mozart/recorder.wav',
'input/mozart/violin.wav'],
out_name='mozart/mozart',
runs=100000,
mask=False,
plot_range=slice(0, 1580))
def separate_mozart_recorder_violin_mel():
"Separation of recorder and violin on the piece by Mozart"
main(mixed_soundfile='input/mozart/mix.wav',
orig_soundfiles=['input/mozart/recorder.wav',
'input/mozart/violin.wav'],
out_name='mozart_mel/mozart',
minfreq=200,
runs=100000,
mask=True,
plot_range=slice(0, 1580),
spect_method="mel")
main(mixed_soundfile='input/mozart/mix.wav',
orig_soundfiles=['input/mozart/recorder.wav',
'input/mozart/violin.wav'],
out_name='mozart_mel/mozart',
minfreq=200,
runs=100000,
mask=False,
plot_range=slice(0, 1580),
spect_method="mel")
def separate_mozart_clarinet_piano():
"Separation of clarinet and piano on the piece by Mozart"
main(mixed_soundfile='input/mozart-cl/mix-cl-piano.wav',
orig_soundfiles=['input/mozart-cl/clarinet-high.wav',
'input/mozart-cl/piano-low.wav'],
out_name='mozart-cl/mozart',
runs=100000)
def separate_mozart_piano_mock():
"Mock separation of the piano track."
main(mixed_soundfile='input/mozart-cl/piano-low.wav',
orig_soundfiles=['input/mozart-cl/piano-low.wav'],
out_name='mozart-cl/mozart-mock',
runs=100000,
inst_num=1,
mask=False,
plot_range=slice(0, 1580),
spect_plots=[100]
)
def separate_urmp():
"Separation of selected samples from the URMP dataset."
main(mixed_soundfile='input/URMP/AuMix_03_Dance_fl_cl.wav',
orig_soundfiles=['input/URMP/AuSep_1_fl_03_Dance.wav',
'input/URMP/AuSep_2_cl_03_Dance.wav'],
out_name='URMP/03',
runs=100000)
main(mixed_soundfile='input/URMP/AuMix_09_Jesus_tpt_vn.wav',
orig_soundfiles=['input/URMP/AuSep_1_tpt_09_Jesus.wav',
'input/URMP/AuSep_2_vn_09_Jesus.wav'],
out_name='URMP/09',
runs=100000)
main(mixed_soundfile='input/URMP/AuMix_10_March_tpt_sax.wav',
orig_soundfiles=['input/URMP/AuSep_1_tpt_10_March.wav',
'input/URMP/AuSep_2_sax_10_March.wav'],
out_name='URMP/10',
runs=100000)
main(mixed_soundfile='input/URMP/AuMix_11_Maria_ob_vc.wav',
orig_soundfiles=['input/URMP/AuSep_1_ob_11_Maria.wav',
'input/URMP/AuSep_2_vc_11_Maria.wav'],
out_name='URMP/11',
runs=100000)
main(mixed_soundfile='input/URMP/AuMix_17_Nocturne_vn_fl_cl.wav',
orig_soundfiles=['input/URMP/AuSep_1_vn_17_Nocturne.wav',
'input/URMP/AuSep_2_fl_17_Nocturne.wav',
'input/URMP/AuSep_3_cl_17_Nocturne.wav'],
out_name='URMP/17',
runs=100000,
inst_num=3)
main(mixed_soundfile='input/URMP/AuMix_18_Nocturne_vn_fl_tpt.wav',
orig_soundfiles=['input/URMP/AuSep_1_vn_18_Nocturne.wav',
'input/URMP/AuSep_2_fl_18_Nocturne.wav',
'input/URMP/AuSep_3_tpt_18_Nocturne.wav'],
out_name='URMP/18',
runs=100000,
inst_num=3)
def separate_frere_jacques():
"""
Separation of Bb tin whistle and viola, with generalization of the learned
dictionaries to C tin whistle and violin, and vice versa.
"""
inst_dicts = main(mixed_soundfile='input/fj/bb.wav',
orig_soundfiles=['input/fj/bb-tw.wav',
'input/fj/bb-viola.wav'],
out_name='fj/bb',
runs=100000)
main(mixed_soundfile='input/fj/c.wav',
orig_soundfiles=['input/fj/c-tw.wav',
'input/fj/c-violin.wav'],
out_name='fj/c',
out_name_run_suffix='-gen',
runs=100000,
supply_dicts=inst_dicts)
inst_dicts = main(mixed_soundfile='input/fj/c.wav',
orig_soundfiles=['input/fj/c-tw.wav',
'input/fj/c-violin.wav'],
out_name='fj/c',
runs=100000)
main(mixed_soundfile='input/fj/bb.wav',
orig_soundfiles=['input/fj/bb-tw.wav',
'input/fj/bb-viola.wav'],
out_name='fj/bb',
out_name_run_suffix='-gen',
supply_dicts=inst_dicts,
runs=100000)
def separate_jaiswal(number):
"""
Separation of the data by Jaiswal et al.
Parameters
----------
number : int
Number of the sample to be considered.
"""
main(mixed_soundfile='input/jaiswal/test{}.wav'.format(number),
orig_soundfiles=['input/jaiswal/test{}-01.wav'.format(number),
'input/jaiswal/test{}-02.wav'.format(number)],
out_name='jaiswal/jaiswal{}'.format(number))
def separate_duan():
"""
Separation of the data by Duan et al.
"""
main(mixed_soundfile='input/duan/Euphonium_Oboe.wav',
orig_soundfiles=['input/duan/Oboe.wav',
'input/duan/Euphonium.wav'],
out_name='duan/eo')
main(mixed_soundfile='input/duan/dyrcj_wqyn.wav',
orig_soundfiles=['input/duan/dyrcj_piccolo.wav',
'input/duan/wqyn_organ.wav'],
out_name='duan/po')
main(mixed_soundfile='input/duan/dyrcj_wqyn_fywz.wav',
orig_soundfiles=['input/duan/dyrcj_piccolo.wav',
'input/duan/wqyn_organ.wav',
'input/duan/fywz_oboe.wav'],
out_name='duan/poo',
inst_num=3)
if __name__ == '__main__':
separate_mozart_recorder_violin()
separate_mozart_recorder_violin_mel()
separate_mozart_clarinet_piano()
separate_mozart_piano_mock()
separate_frere_jacques()
separate_urmp()
# The number of the sample is given via command line.
# Unfortunately, we cannot distribute the data.
#separate_jaiswal(int(sys.argv[1]))
# Get the data from:
# https://sites.google.com/site/mperesult/musicseparationresults
# Upsample to 44100 Hz.
#separate_duan()