#!python3
"""
Module to generate spectograms, save them as images and resynthesize audio.
When invoked, a side-by-side comparison of the spectrograms from the
different methods is performed.
"""
from __future__ import absolute_import, division, print_function
import numpy as np
import tensorflow as tf
import scipy.fftpack as fftpack
import pyfftw
import pyfftw.interfaces.scipy_fftpack as fftpack
import imageio
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from . import wav
from . import specttool
from ..dictsep import pursuit
from ..dictsep import dictlearn
[docs]def gauss(x, stdev, normalize=True):
"""
Generate a Gaussian window/kernel with mean 0.
Parameters
----------
x : array_like
Points to evaluate the Gaussian
stdev : float
Standard deviation
normalize : bool
Whether to l1-normalize the Gaussian
Returns
-------
window : ndarray
Gaussian window/kernel
"""
x = np.asarray(x)
window = np.exp(- np.square(x / stdev) / 2)
if normalize:
return window / np.sum(window)
else:
return window
[docs]def spectwrite(filename, spectrogram, color="viridis", db=100):
"""
Save a spectrogram as an image. The data is normalized to dynamic range
of 100 dB, and a logarithmic Viridis color scale is used.
Parameters
----------
filename : string
Name of the image file
spectrogram : array_like
Spectrogram
color : string or NoneType
Whether to make a color plot
"""
spectrogram = np.asarray(spectrogram)
print("spect energy: {}".format(np.sum(np.square(spectrogram))))
print("spect max: {}".format(np.amax(spectrogram)))
spectrogram = spectrogram / np.amax(spectrogram)
floor = 10**(-db/20)
logspect = np.log10((floor + spectrogram) / (floor + 1)) + db/20
maxspect = np.amax(logspect)
scalespect = logspect / maxspect
scalespect = scalespect[::-1, :]
scalespect = np.maximum(scalespect, 0)
scalespect = np.minimum(scalespect, 1)
if not color:
imageio.imwrite(filename, 1 - scalespect)
elif color == "magma":
plotspect = cm.magma(1 - scalespect)
imageio.imwrite(filename, plotspect)
else:
plotspect = cm.viridis(scalespect)
imageio.imwrite(filename, plotspect)
[docs]def stripe(signal, spectheight, sigmas, sampdist, eval_range):
"""
Populate an array with time-shifted and windowed versions of an audio
signal. This serves as a precursor for FFT calculation. The first
spectrogram time frame coincides with the first sample in the signal.
Out-of-bounds array entries are assumed as zero.
Parameters
----------
signal : array_like
Audio signal
spectheight : int
Height of the linear-frequency spectrogram
sigmas : float
Number of standard deviations after which to cut the window
sampdist : int
Time intervals to sample the spectrogram
eval_range : slice
Time range of the spectrogram to be computed
Returns
-------
stripeplot : ndarray
Populated array
"""
signal = np.asarray(signal)
pos = np.arange(0, signal.size, sampdist)[eval_range]
stripeplot = pyfftw.zeros_aligned((spectheight*2, pos.size), order='F',
dtype='complex128')
window = gauss(np.arange(-spectheight, spectheight), spectheight / sigmas)
for i in range(pos.size):
p = pos[i]
lo = max(0, spectheight - p)
hi = min(2*spectheight, signal.size - p + spectheight)
winsig = np.zeros(2*spectheight)
winsig[lo:hi] = (signal[p-spectheight+lo:p-spectheight+hi]
* window[lo:hi])
stripeplot[:,i] = winsig
return stripeplot
[docs]def stft(signal, length, sigmas, sampdist, eval_range=slice(None, None)):
"""
Calculate the linear-frequency spectrogram of a given audio signal
by calling `stripe` and computing the FFT along the first axis.
Parameters
----------
signal : array_like
Audio signal
spectheight : int
Height of the linear-frequency spectrogram
sigmas : float
Number of standard deviations after which to cut the window
sampdist : int
Time intervals to sample the spectrogram
eval_range : slice
Time range of the spectrogram to be computed
Returns
-------
spectrogram : ndarray
Complex-valued linear-frequency spectrogram
"""
stripeplot = stripe(signal, length, sigmas, sampdist, eval_range)
fft_object = pyfftw.builders.fft(stripeplot, axis=0, threads=8,
overwrite_input=False, avoid_copy=True)
return fft_object()
[docs]def istft(spect, siglen, sigmas, sampdist):
"""
Reconstruct an audio signal from a complex-valued linear-frequency
spectrogram via orthogonal projection. If a sample cannot be inferred
from the spectrogram, it is set to zero.
Parameters
----------
spect : array_like
Complex-valued linear-frequency spectrogram
siglen : int
Intended length of the audio signal
sigmas : float
Number of standard deviations after which to cut the window
sampdist : int
Time intervals to sample the spectrogram
Returns
-------
signal : ndarray
Reconstructed audio signal
"""
spect = np.asarray(spect)
spectheight = spect.shape[0]
fft_object = pyfftw.builders.irfft(spect, n=spectheight*2,
axis=0, threads=8,
overwrite_input=False)
stripeplot = fft_object()
signal = np.zeros(siglen)
window = gauss(np.arange(-spectheight, spectheight), spectheight / sigmas)
return np.asarray(specttool.unstripe(stripeplot, window, siglen,
spectheight, stripeplot.shape[1],
sampdist))
[docs]def spectrogram(signal, spectheight, sigmas, sampdist,
eval_range=slice(None, None)):
"""
Calculate the linear-frequency magnitude spectrogram via STFT.
Parameters
----------
signal : array_like
Audio signal
spectheight : int
Height of the linear-frequency spectrogram
sigmas : float
Number of standard deviations after which to cut the window
sampdist : int
Time intervals to sample the spectrogram
eval_range : slice
Time range of the spectrogram to be computed
Returns
-------
spectrogram : ndarray
Linear-frequency magnitude spectrogram
"""
spect = np.abs(stft(signal, spectheight, sigmas, sampdist, eval_range))
return spect
[docs]def synth_audio(spect, siglen, sigmas, sampdist, iterations,
guess=None, size=2000):
"""
Reconstruct an audio signal from a linear-frequency magnitude
spectrogram via the algorithm by Griffin and Lim.
Parameters
----------
spect : array_like
Linear-frequency magnitude spectrogram
siglen : int
Intended length of the audio signal
sigmas : float
Number of standard deviations after which to cut the window
sampdist : int
Time intervals to sample the spectrogram
iterations : int
Number of Griffin-Lim iterations to perform
guess : array_like
Initial value for the audio signal
size : int
Batch size for the FFT
Returns
-------
signal : ndarray
Reconstructed audio signal
"""
spect = np.asarray(spect)
spectheight = spect.shape[0]
origspect = spect
if guess is None:
sigold = np.zeros(siglen)
else:
sigold = np.asarray(guess)
throwaway = ((spectheight-1) // sampdist + 1) * 2
for k in range(iterations):
print("Griffin & Lim: iteration {}".format(k))
signew = np.zeros(siglen)
pos = np.arange(0, siglen, sampdist)
lo = 0
cutlo = 0
for j in np.arange(0, pos.size, size):
cutlo = pos[j]
if j >= throwaway:
lo = pos[j - throwaway]
else:
lo = pos[0]
if pos.size > j + size + throwaway:
hi = pos[j + size + throwaway]
else:
hi = siglen
if pos.size > j + size:
cuthi = pos[j + size]
else:
cuthi = hi
spect = stft(sigold[lo:hi], spectheight,
sigmas, sampdist)[:spectheight]
specttool.adapt_mag(
spect, origspect[:, (lo//sampdist):((hi-1)//sampdist+1)],
*spect.shape)
frag = istft(spect, hi-lo, sigmas, sampdist)
signew[cutlo:cuthi] = frag[cutlo-lo:cuthi-lo]
sigold = signew
return signew, spect[:spectheight, :]
[docs]def project_audio(spect, siglen, sigmas, sampdist, size=2000):
"""
Reconstruct an audio signal from a linear-frequency complex
spectrogram via orthogonal projection.
Parameters
----------
spect : array_like
Linear-frequency magnitude spectrogram
siglen : int
Intended length of the audio signal
sigmas : float
Number of standard deviations after which to cut the window
sampdist : int
Time intervals to sample the spectrogram
size : int
Batch size for the FFT
Returns
-------
signal : ndarray
Reconstructed audio signal
"""
spect = np.asarray(spect)
spectheight = spect.shape[0]
origspect = spect
throwaway = ((spectheight-1) // sampdist + 1) * 2
signal = np.zeros(siglen)
pos = np.arange(0, siglen, sampdist)
lo = 0
cutlo = 0
for j in np.arange(0, pos.size, size):
cutlo = pos[j]
if j >= throwaway:
lo = pos[j - throwaway]
else:
lo = pos[0]
if pos.size > j + size + throwaway:
hi = pos[j + size + throwaway]
else:
hi = siglen
if pos.size > j + size:
cuthi = pos[j + size]
else:
cuthi = hi
spect = origspect[:, (lo//sampdist):((hi-1)//sampdist+1)]
frag = istft(spect, hi-lo, sigmas, sampdist)
signal[cutlo:cuthi] = frag[cutlo-lo:cuthi-lo]
return signal
[docs]def winlog_spect(spect, freqs, basefreq, sigmas, scale=True):
"""
Apply a logarithmic transform on the frequency axis of a linear-frequency
magnitude spectrogram while preserving the width of the horizontal lines
via Gaussian smoothing. The attenuation of the higher frequency is
counteracted by scaling.
Parameters
----------
spect : array_like
Linear-frequency magnitude spectrogram
freqs : array_like
Frequencies to place the smoothing kernel
(normalized to the sampling frequency)
basefreq : float
Frequency to assume as a minimum for smoothing
(normalized to the sampling frequency)
sigmas : float
Number of standard deviations after which to cut the kernel
scale : bool
Whether to adjust the l1 norm of the kernels w.r.t. frequency
Returns
-------
logspect : ndarray
Log-frequency magnitude spectrogram
"""
spect = np.asarray(spect)
freqs = np.asarray(freqs) * (2 * spect.shape[0])
basefreq = basefreq * (2 * spect.shape[0])
logspect = np.zeros((freqs.size, spect.shape[1]))
for i in range(freqs.size):
freq = freqs[i]
freqint = int(np.round(freq))
freqdiff = freq - freqint
stretch = freq / basefreq
if stretch > 1 + 1e-12:
sigma = sigmas/np.pi * np.sqrt(np.square(stretch) - 1) / np.sqrt(2)
width = int(np.ceil(sigma * sigmas))
lo = max(0, width - freqint)
hi = min(2*width, spect.shape[0] - freqint + width)
window = gauss(np.arange(lo-width, hi-width) - freqdiff, sigma)
if scale:
window *= stretch
logspectrow = np.sum(window.reshape(hi-lo, 1)
* spect[freqint-width+lo:freqint-width+hi, :],
axis=0)
else:
logspectrow = spect[freqint,:]
logspect[i,:] = logspectrow
return logspect
[docs]def logspect_mel(signal, spectheight, sigmas, sampdist, basefreq,
minfreq, maxfreq, numfreqs, eval_range=slice(None, None),
scale=True):
"""
Compute the Mel-frequency spectrogram of an audio signal.
Parameters
----------
signal : array_like
Audio signal
spectheight : int
Height of the linear-frequency spectrogram
sigmas : float
Number of standard deviations after which to cut the window/kernel
sampdist : int
Time intervals to sample the spectrogram
basefreq : float
Frequency to assume as a minimum for smoothing
(normalized to the sampling frequency)
minfreq : float
Minimum frequency to be represented (included)
(normalized to the sampling frequency)
maxfreq : float
Maximum frequency to be represented (excluded)
(normalized to the sampling frequency)
numfreqs : float
Height of the log-frequency spectrogram
eval_range : slice
Time range of the spectrogram to be computed
scale : bool
Whether to adjust the l1 norm of the kernels w.r.t. frequency
Returns
-------
logspect : ndarray
Log-frequency magnitude spectrogram
spect : ndarray
Linear-frequency magnitude spectrogram
"""
signal = np.asarray(signal)
freqs = np.logspace(np.log10(minfreq), np.log10(maxfreq), numfreqs,
endpoint=False)
spect = (spectrogram(signal, spectheight, sigmas, sampdist, eval_range)
[:spectheight, :]**2)
logspect = winlog_spect(spect, freqs, basefreq, sigmas, scale)
return logspect, spect
[docs]def smoothconv(signal, window, kernel, sampdist):
signal = tf.convert_to_tensor(signal, dtype=tf.float64)
window = tf.convert_to_tensor(window, dtype=tf.float64)
kernel = tf.convert_to_tensor(kernel, dtype=tf.float64)
row = tf.nn.conv2d(tf.reshape(signal, [1, 1, -1, 1]),
tf.reshape(window, [1, -1, 1, 2]),
[1, 1, 1, 1], 'SAME')
y = tf.nn.conv2d(tf.sqrt(tf.square(row[:,:,:,0:1])
+ tf.square(row[:,:,:,1:2])),
tf.reshape(kernel, [1, -1, 1, 1]),
[1, 1, sampdist, 1], 'SAME')
return y
[docs]def logspect_cq(signal, spectheight, sigmas, sampdist, basefreq,
minfreq, maxfreq, numfreqs, smooth=True):
"""
Compute the time-smoothed CQT of an audio signal.
Parameters
----------
signal : array_like
Audio signal
spectheight : int
Height of the linear-frequency spectrogram
sigmas : float
Number of standard deviations after which to cut the window/kernel
sampdist : int
Time intervals to sample the spectrogram
basefreq : float
Frequency to assume as a minimum for smoothing
(normalized to the sampling frequency)
minfreq : float
Minimum frequency to be represented (included)
(normalized to the sampling frequency)
maxfreq : float
Maximum frequency to be represented (excluded)
(normalized to the sampling frequency)
numfreqs : float
Height of the log-frequency spectrogram
Returns
-------
logspect : ndarray
Log-frequency magnitude spectrogram
"""
signal = np.asarray(signal)
freqs = np.logspace(np.log10(minfreq), np.log10(maxfreq), numfreqs,
endpoint=False)
samppos = np.arange(0, signal.size, sampdist)
spect = np.zeros((numfreqs, samppos.size))
for i in range(freqs.size):
freq = freqs[i]
print("freq {}: {}".format(i, freq))
stretch = basefreq / freq
sigma = spectheight * stretch
sigmalen = int(np.ceil(sigma))
window = gauss(np.arange(-sigmalen, sigmalen+1), sigma / sigmas)
phase = np.exp(-1j*2*np.pi * freq * np.arange(-sigmalen, sigmalen+1))
window = window * phase
if stretch >= 1 - 1e-6 or not smooth:
convwindow = [1]
else:
sigmaadd = spectheight * np.sqrt(1 - np.square(stretch))
sigmaaddlen = int(np.ceil(sigmaadd))
convwindow = gauss(np.arange(-sigmaaddlen, sigmaaddlen+1),
sigmaadd / sigmas)
y = smoothconv(signal,
np.vstack((np.real(window),
np.imag(window))).T,
convwindow,
sampdist)
spect[i,:] = np.ravel(y)
return spect
[docs]def logspect_pursuit(signal, spectheight, sigmas, sampdist, basefreq,
minfreq, maxfreq, numfreqs, fsigma,
eval_range=slice(None, None)):
"""
Compute the log-frequency frequency via sparse pursuit.
Parameters
----------
signal : array_like
Audio signal
spectheight : int
Height of the linear-frequency spectrogram
sigmas : float
Number of standard deviations after which to cut the window/kernel
sampdist : int
Time intervals to sample the spectrogram
basefreq : float
Frequency to assume as a minimum for smoothing
(normalized to the sampling frequency)
minfreq : float
Minimum frequency to be represented (included)
(normalized to the sampling frequency)
maxfreq : float
Maximum frequency to be represented (excluded)
(normalized to the sampling frequency)
numfreqs : float
Height of the log-frequency spectrogram
fsigma : float
Standard deviation (frequency)
eval_range : slice
Time range of the spectrogram to be computed
Returns
-------
logspect : ndarray
Log-frequency magnitude spectrogram
"""
signal = np.asarray(signal)
minfreq = minfreq * (2 * spectheight)
maxfreq = maxfreq * (2 * spectheight)
spect = (spectrogram(signal, spectheight, sigmas, sampdist)
[:spectheight, eval_range])
logspect = np.zeros((numfreqs, spect.shape[1]))
linspect = np.zeros(spect.shape)
print("timeslots: {}".format(spect.shape[1]))
inst_dict = np.asarray([[1.]])
init = None
if basefreq is None:
stretch = 1
else:
basefreq = basefreq * (2 * spectheight)
stretch = numfreqs / np.log(maxfreq/minfreq) / basefreq
print("stretch: {}".format(stretch))
pexp = 1
qexp = 1
make_bounds, make_inits = dictlearn.make_closures(fsigma)
# setting harscale = 0 is a hack to make the gradient w.r.t. spread zero
fixed_params = (inst_dict, 0)
for i in range(spect.shape[1]):
print("Timeslot {}".format(i))
y = spect[:,i]
peaks, reconstruction = pursuit.peak_pursuit(y, 1000, 1000, 20, 1,
pursuit.inst_shift,
pursuit.inst_shift_obj,
pursuit.inst_shift_grad,
make_bounds, make_inits,
fixed_params,
pursuit.max_selector,
(5,), pexp, qexp)
logshifts = np.ones(len(peaks)) * (-np.inf)
idcs, = np.where(peaks.shifts > 0)
logshifts[idcs] = (numfreqs / np.log(maxfreq / minfreq)
* np.log(peaks.shifts[idcs] / minfreq))
peaks.shifts = logshifts
peaks.params[0, :] = peaks.params[0, :] * stretch
logspect[:, i] = pursuit.inst_shift(peaks, fixed_params, pexp, numfreqs)
linspect[:, i] = reconstruction
return logspect, linspect
[docs]def example_delta_octaves():
"""
Comparison of the different representations with a delta transient and
sinusoids.
"""
n = 200000
signal = np.zeros(n)
signal += (np.sin(2*np.pi * 1000/48000 * np.arange(0, n)) +
np.sin(2*np.pi * 2000/48000 * np.arange(0, n)) +
np.sin(2*np.pi * 4000/48000 * np.arange(0, n)) +
np.sin(2*np.pi * 8000/48000 * np.arange(0, n)) +
np.sin(2*np.pi * 16000/48000 * np.arange(0, n)))
signal[100000] = 2*np.sqrt(2*np.pi)/6*1024
spectheight = 1024*6
spect = spectrogram(signal, spectheight, 6, 128)
spectwrite('delta+octaves_lin.png', spect[:spectheight, :],
color=False, db=30)
spect = logspect_mel(signal, spectheight, 6, 128, 640/48000,
640/48000, 20480/48000, 1024, scale=True)[0]
spectwrite('delta+octaves_mel_pi.png', np.sqrt(spect),
color=False, db=30)
spect = logspect_mel(signal, spectheight, 6, 128, 640/48000,
640/48000, 20480/48000, 1024, scale=False)[0]
spectwrite('delta+octaves_mel_fu.png', np.sqrt(spect),
color=False, db=30)
spect = logspect_cq(signal, spectheight, 6, 128, 640/48000,
640/48000, 20480/48000, 1024, False)
spectwrite('delta+octave_cq.png', spect,
color=False, db=60)
spect = logspect_cq(signal, spectheight, 6, 128, 640/48000,
640/48000, 20480/48000, 1024)
spectwrite('delta+octave_cq_smooth.png', spect,
color=False, db=30)
[docs]def example_delta_scale():
"""
Display of the properties of the smoothed CQT with a delta transient and a
chromatic scale of sinusoids.
"""
n = 160000
timedist = 10000
signal = np.zeros(n)
for i in range(13):
tone = np.sin(2*np.pi*np.arange(timedist)*880*2**(i/12)/48000)
tone[:1000] *= np.hanning(2000)[:1000]
tone[9000:] *= np.hanning(2000)[1000:]
signal[timedist*(i+2):timedist*(i+3)] = tone
signal[10000] = 2*np.sqrt(2*np.pi)/6*1024
spect = logspect_cq(signal, 1024*6, 6, 128, 640/48000,
640/48000, 2560/48000, 900)
spectwrite('delta+scale_cq_smooth.png', spect, None, db=40)
[docs]def example_brahms():
"""
Application of different transforms on a recording of the 1st violin sonata
of Johannes Brahms.
"""
signal = wav.read('input/brahms1.wav')[0][44100:44100*11]
spectheight = 1024*6
spect = logspect_pursuit(signal, spectheight, 6, 256, None,
20/48000, 20480/48000, 1024, 6/np.pi)[0]
spectwrite('brahms_pursuit.png', spect, None)
basefreq = 1024/np.log(1024)/(12*1024)
spect = logspect_mel(signal, spectheight, 6, 256, basefreq,
basefreq, 20480/48000, 527)[0]
spectwrite('brahms_mel.png', np.sqrt(spect), None)
spect = logspect_cq(signal, spectheight, 6, 256, basefreq,
20/48000, 20480/48000, 1024, False)
spectwrite('brahms_cq.png', spect, None)
[docs]def example_mozart():
"""
Application of the sparse pursuit method on the individual instrument
tracks of the piece by Mozart.
"""
spectheight = 1024*6
basefreq = 1024/np.log(1024)/(12*1024)
for (fi,fm,fs) in [('input/mozart/recorder.wav', 'recorder_mel.png',
'recorder_sparse.png'),
('input/mozart/violin.wav', 'violin_mel.png',
'violin_sparse.png'),
('output/mozart/mozart-7-synth0-mask.wav',
'recorder_post_mel.png',
'recorder_post_sparse.png'),
('output/mozart/mozart-7-synth1-mask.wav',
'violin_post_mel.png',
'violin_post_sparse.png'),
('output/mozart/mozart-7-synth-mask.wav',
'mix_post_mel.png',
'mix_post_sparse.png')]:
signal = wav.read(fi)[0]
spect = logspect_mel(signal, spectheight, 6, 256, basefreq,
basefreq, 20480/48000, 527,
eval_range=slice(0, 1580))[0]
spectwrite(fm, np.sqrt(spect), None)
spect = logspect_pursuit(signal, spectheight, 6, 256, None,
20/48000, 20480/48000, 1024, 6/np.pi,
eval_range=slice(0, 1580))[0]
spectwrite(fs, spect, None)
[docs]def example_beethoven():
signal = wav.read('input/beethoven.wav')[0]
spectheight = 1024*6
spect = spectrogram(signal, spectheight, 6, 128)
spectwrite('beethoven_lin.png', spect[:spectheight//4, :],
color=False, db=50)
if __name__ == '__main__':
example_delta_octaves()
example_delta_scale()
example_brahms()
example_mozart()
example_beethoven()