Source code for musisep.audio.spect

#!python3

"""
Module to generate spectograms, save them as images and resynthesize audio.
When invoked, a side-by-side comparison of the spectrograms from the
different methods is performed.
"""

from __future__ import absolute_import, division, print_function

import numpy as np
import tensorflow as tf
import scipy.fftpack as fftpack
import pyfftw
import pyfftw.interfaces.scipy_fftpack as fftpack
import imageio
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from . import wav
from . import specttool
from ..dictsep import pursuit
from ..dictsep import dictlearn

[docs]def gauss(x, stdev, normalize=True): """ Generate a Gaussian window/kernel with mean 0. Parameters ---------- x : array_like Points to evaluate the Gaussian stdev : float Standard deviation normalize : bool Whether to l1-normalize the Gaussian Returns ------- window : ndarray Gaussian window/kernel """ x = np.asarray(x) window = np.exp(- np.square(x / stdev) / 2) if normalize: return window / np.sum(window) else: return window
[docs]def spectwrite(filename, spectrogram, color="viridis", db=100): """ Save a spectrogram as an image. The data is normalized to dynamic range of 100 dB, and a logarithmic Viridis color scale is used. Parameters ---------- filename : string Name of the image file spectrogram : array_like Spectrogram color : string or NoneType Whether to make a color plot """ spectrogram = np.asarray(spectrogram) print("spect energy: {}".format(np.sum(np.square(spectrogram)))) print("spect max: {}".format(np.amax(spectrogram))) spectrogram = spectrogram / np.amax(spectrogram) floor = 10**(-db/20) logspect = np.log10((floor + spectrogram) / (floor + 1)) + db/20 maxspect = np.amax(logspect) scalespect = logspect / maxspect scalespect = scalespect[::-1, :] scalespect = np.maximum(scalespect, 0) scalespect = np.minimum(scalespect, 1) if not color: imageio.imwrite(filename, 1 - scalespect) elif color == "magma": plotspect = cm.magma(1 - scalespect) imageio.imwrite(filename, plotspect) else: plotspect = cm.viridis(scalespect) imageio.imwrite(filename, plotspect)
[docs]def stripe(signal, spectheight, sigmas, sampdist, eval_range): """ Populate an array with time-shifted and windowed versions of an audio signal. This serves as a precursor for FFT calculation. The first spectrogram time frame coincides with the first sample in the signal. Out-of-bounds array entries are assumed as zero. Parameters ---------- signal : array_like Audio signal spectheight : int Height of the linear-frequency spectrogram sigmas : float Number of standard deviations after which to cut the window sampdist : int Time intervals to sample the spectrogram eval_range : slice Time range of the spectrogram to be computed Returns ------- stripeplot : ndarray Populated array """ signal = np.asarray(signal) pos = np.arange(0, signal.size, sampdist)[eval_range] stripeplot = pyfftw.zeros_aligned((spectheight*2, pos.size), order='F', dtype='complex128') window = gauss(np.arange(-spectheight, spectheight), spectheight / sigmas) for i in range(pos.size): p = pos[i] lo = max(0, spectheight - p) hi = min(2*spectheight, signal.size - p + spectheight) winsig = np.zeros(2*spectheight) winsig[lo:hi] = (signal[p-spectheight+lo:p-spectheight+hi] * window[lo:hi]) stripeplot[:,i] = winsig return stripeplot
[docs]def stft(signal, length, sigmas, sampdist, eval_range=slice(None, None)): """ Calculate the linear-frequency spectrogram of a given audio signal by calling `stripe` and computing the FFT along the first axis. Parameters ---------- signal : array_like Audio signal spectheight : int Height of the linear-frequency spectrogram sigmas : float Number of standard deviations after which to cut the window sampdist : int Time intervals to sample the spectrogram eval_range : slice Time range of the spectrogram to be computed Returns ------- spectrogram : ndarray Complex-valued linear-frequency spectrogram """ stripeplot = stripe(signal, length, sigmas, sampdist, eval_range) fft_object = pyfftw.builders.fft(stripeplot, axis=0, threads=8, overwrite_input=False, avoid_copy=True) return fft_object()
[docs]def istft(spect, siglen, sigmas, sampdist): """ Reconstruct an audio signal from a complex-valued linear-frequency spectrogram via orthogonal projection. If a sample cannot be inferred from the spectrogram, it is set to zero. Parameters ---------- spect : array_like Complex-valued linear-frequency spectrogram siglen : int Intended length of the audio signal sigmas : float Number of standard deviations after which to cut the window sampdist : int Time intervals to sample the spectrogram Returns ------- signal : ndarray Reconstructed audio signal """ spect = np.asarray(spect) spectheight = spect.shape[0] fft_object = pyfftw.builders.irfft(spect, n=spectheight*2, axis=0, threads=8, overwrite_input=False) stripeplot = fft_object() signal = np.zeros(siglen) window = gauss(np.arange(-spectheight, spectheight), spectheight / sigmas) return np.asarray(specttool.unstripe(stripeplot, window, siglen, spectheight, stripeplot.shape[1], sampdist))
[docs]def spectrogram(signal, spectheight, sigmas, sampdist, eval_range=slice(None, None)): """ Calculate the linear-frequency magnitude spectrogram via STFT. Parameters ---------- signal : array_like Audio signal spectheight : int Height of the linear-frequency spectrogram sigmas : float Number of standard deviations after which to cut the window sampdist : int Time intervals to sample the spectrogram eval_range : slice Time range of the spectrogram to be computed Returns ------- spectrogram : ndarray Linear-frequency magnitude spectrogram """ spect = np.abs(stft(signal, spectheight, sigmas, sampdist, eval_range)) return spect
[docs]def synth_audio(spect, siglen, sigmas, sampdist, iterations, guess=None, size=2000): """ Reconstruct an audio signal from a linear-frequency magnitude spectrogram via the algorithm by Griffin and Lim. Parameters ---------- spect : array_like Linear-frequency magnitude spectrogram siglen : int Intended length of the audio signal sigmas : float Number of standard deviations after which to cut the window sampdist : int Time intervals to sample the spectrogram iterations : int Number of Griffin-Lim iterations to perform guess : array_like Initial value for the audio signal size : int Batch size for the FFT Returns ------- signal : ndarray Reconstructed audio signal """ spect = np.asarray(spect) spectheight = spect.shape[0] origspect = spect if guess is None: sigold = np.zeros(siglen) else: sigold = np.asarray(guess) throwaway = ((spectheight-1) // sampdist + 1) * 2 for k in range(iterations): print("Griffin & Lim: iteration {}".format(k)) signew = np.zeros(siglen) pos = np.arange(0, siglen, sampdist) lo = 0 cutlo = 0 for j in np.arange(0, pos.size, size): cutlo = pos[j] if j >= throwaway: lo = pos[j - throwaway] else: lo = pos[0] if pos.size > j + size + throwaway: hi = pos[j + size + throwaway] else: hi = siglen if pos.size > j + size: cuthi = pos[j + size] else: cuthi = hi spect = stft(sigold[lo:hi], spectheight, sigmas, sampdist)[:spectheight] specttool.adapt_mag( spect, origspect[:, (lo//sampdist):((hi-1)//sampdist+1)], *spect.shape) frag = istft(spect, hi-lo, sigmas, sampdist) signew[cutlo:cuthi] = frag[cutlo-lo:cuthi-lo] sigold = signew return signew, spect[:spectheight, :]
[docs]def project_audio(spect, siglen, sigmas, sampdist, size=2000): """ Reconstruct an audio signal from a linear-frequency complex spectrogram via orthogonal projection. Parameters ---------- spect : array_like Linear-frequency magnitude spectrogram siglen : int Intended length of the audio signal sigmas : float Number of standard deviations after which to cut the window sampdist : int Time intervals to sample the spectrogram size : int Batch size for the FFT Returns ------- signal : ndarray Reconstructed audio signal """ spect = np.asarray(spect) spectheight = spect.shape[0] origspect = spect throwaway = ((spectheight-1) // sampdist + 1) * 2 signal = np.zeros(siglen) pos = np.arange(0, siglen, sampdist) lo = 0 cutlo = 0 for j in np.arange(0, pos.size, size): cutlo = pos[j] if j >= throwaway: lo = pos[j - throwaway] else: lo = pos[0] if pos.size > j + size + throwaway: hi = pos[j + size + throwaway] else: hi = siglen if pos.size > j + size: cuthi = pos[j + size] else: cuthi = hi spect = origspect[:, (lo//sampdist):((hi-1)//sampdist+1)] frag = istft(spect, hi-lo, sigmas, sampdist) signal[cutlo:cuthi] = frag[cutlo-lo:cuthi-lo] return signal
[docs]def winlog_spect(spect, freqs, basefreq, sigmas, scale=True): """ Apply a logarithmic transform on the frequency axis of a linear-frequency magnitude spectrogram while preserving the width of the horizontal lines via Gaussian smoothing. The attenuation of the higher frequency is counteracted by scaling. Parameters ---------- spect : array_like Linear-frequency magnitude spectrogram freqs : array_like Frequencies to place the smoothing kernel (normalized to the sampling frequency) basefreq : float Frequency to assume as a minimum for smoothing (normalized to the sampling frequency) sigmas : float Number of standard deviations after which to cut the kernel scale : bool Whether to adjust the l1 norm of the kernels w.r.t. frequency Returns ------- logspect : ndarray Log-frequency magnitude spectrogram """ spect = np.asarray(spect) freqs = np.asarray(freqs) * (2 * spect.shape[0]) basefreq = basefreq * (2 * spect.shape[0]) logspect = np.zeros((freqs.size, spect.shape[1])) for i in range(freqs.size): freq = freqs[i] freqint = int(np.round(freq)) freqdiff = freq - freqint stretch = freq / basefreq if stretch > 1 + 1e-12: sigma = sigmas/np.pi * np.sqrt(np.square(stretch) - 1) / np.sqrt(2) width = int(np.ceil(sigma * sigmas)) lo = max(0, width - freqint) hi = min(2*width, spect.shape[0] - freqint + width) window = gauss(np.arange(lo-width, hi-width) - freqdiff, sigma) if scale: window *= stretch logspectrow = np.sum(window.reshape(hi-lo, 1) * spect[freqint-width+lo:freqint-width+hi, :], axis=0) else: logspectrow = spect[freqint,:] logspect[i,:] = logspectrow return logspect
[docs]def logspect_mel(signal, spectheight, sigmas, sampdist, basefreq, minfreq, maxfreq, numfreqs, eval_range=slice(None, None), scale=True): """ Compute the Mel-frequency spectrogram of an audio signal. Parameters ---------- signal : array_like Audio signal spectheight : int Height of the linear-frequency spectrogram sigmas : float Number of standard deviations after which to cut the window/kernel sampdist : int Time intervals to sample the spectrogram basefreq : float Frequency to assume as a minimum for smoothing (normalized to the sampling frequency) minfreq : float Minimum frequency to be represented (included) (normalized to the sampling frequency) maxfreq : float Maximum frequency to be represented (excluded) (normalized to the sampling frequency) numfreqs : float Height of the log-frequency spectrogram eval_range : slice Time range of the spectrogram to be computed scale : bool Whether to adjust the l1 norm of the kernels w.r.t. frequency Returns ------- logspect : ndarray Log-frequency magnitude spectrogram spect : ndarray Linear-frequency magnitude spectrogram """ signal = np.asarray(signal) freqs = np.logspace(np.log10(minfreq), np.log10(maxfreq), numfreqs, endpoint=False) spect = (spectrogram(signal, spectheight, sigmas, sampdist, eval_range) [:spectheight, :]**2) logspect = winlog_spect(spect, freqs, basefreq, sigmas, scale) return logspect, spect
[docs]def smoothconv(signal, window, kernel, sampdist): signal = tf.convert_to_tensor(signal, dtype=tf.float64) window = tf.convert_to_tensor(window, dtype=tf.float64) kernel = tf.convert_to_tensor(kernel, dtype=tf.float64) row = tf.nn.conv2d(tf.reshape(signal, [1, 1, -1, 1]), tf.reshape(window, [1, -1, 1, 2]), [1, 1, 1, 1], 'SAME') y = tf.nn.conv2d(tf.sqrt(tf.square(row[:,:,:,0:1]) + tf.square(row[:,:,:,1:2])), tf.reshape(kernel, [1, -1, 1, 1]), [1, 1, sampdist, 1], 'SAME') return y
[docs]def logspect_cq(signal, spectheight, sigmas, sampdist, basefreq, minfreq, maxfreq, numfreqs, smooth=True): """ Compute the time-smoothed CQT of an audio signal. Parameters ---------- signal : array_like Audio signal spectheight : int Height of the linear-frequency spectrogram sigmas : float Number of standard deviations after which to cut the window/kernel sampdist : int Time intervals to sample the spectrogram basefreq : float Frequency to assume as a minimum for smoothing (normalized to the sampling frequency) minfreq : float Minimum frequency to be represented (included) (normalized to the sampling frequency) maxfreq : float Maximum frequency to be represented (excluded) (normalized to the sampling frequency) numfreqs : float Height of the log-frequency spectrogram Returns ------- logspect : ndarray Log-frequency magnitude spectrogram """ signal = np.asarray(signal) freqs = np.logspace(np.log10(minfreq), np.log10(maxfreq), numfreqs, endpoint=False) samppos = np.arange(0, signal.size, sampdist) spect = np.zeros((numfreqs, samppos.size)) for i in range(freqs.size): freq = freqs[i] print("freq {}: {}".format(i, freq)) stretch = basefreq / freq sigma = spectheight * stretch sigmalen = int(np.ceil(sigma)) window = gauss(np.arange(-sigmalen, sigmalen+1), sigma / sigmas) phase = np.exp(-1j*2*np.pi * freq * np.arange(-sigmalen, sigmalen+1)) window = window * phase if stretch >= 1 - 1e-6 or not smooth: convwindow = [1] else: sigmaadd = spectheight * np.sqrt(1 - np.square(stretch)) sigmaaddlen = int(np.ceil(sigmaadd)) convwindow = gauss(np.arange(-sigmaaddlen, sigmaaddlen+1), sigmaadd / sigmas) y = smoothconv(signal, np.vstack((np.real(window), np.imag(window))).T, convwindow, sampdist) spect[i,:] = np.ravel(y) return spect
[docs]def logspect_pursuit(signal, spectheight, sigmas, sampdist, basefreq, minfreq, maxfreq, numfreqs, fsigma, eval_range=slice(None, None)): """ Compute the log-frequency frequency via sparse pursuit. Parameters ---------- signal : array_like Audio signal spectheight : int Height of the linear-frequency spectrogram sigmas : float Number of standard deviations after which to cut the window/kernel sampdist : int Time intervals to sample the spectrogram basefreq : float Frequency to assume as a minimum for smoothing (normalized to the sampling frequency) minfreq : float Minimum frequency to be represented (included) (normalized to the sampling frequency) maxfreq : float Maximum frequency to be represented (excluded) (normalized to the sampling frequency) numfreqs : float Height of the log-frequency spectrogram fsigma : float Standard deviation (frequency) eval_range : slice Time range of the spectrogram to be computed Returns ------- logspect : ndarray Log-frequency magnitude spectrogram """ signal = np.asarray(signal) minfreq = minfreq * (2 * spectheight) maxfreq = maxfreq * (2 * spectheight) spect = (spectrogram(signal, spectheight, sigmas, sampdist) [:spectheight, eval_range]) logspect = np.zeros((numfreqs, spect.shape[1])) linspect = np.zeros(spect.shape) print("timeslots: {}".format(spect.shape[1])) inst_dict = np.asarray([[1.]]) init = None if basefreq is None: stretch = 1 else: basefreq = basefreq * (2 * spectheight) stretch = numfreqs / np.log(maxfreq/minfreq) / basefreq print("stretch: {}".format(stretch)) pexp = 1 qexp = 1 make_bounds, make_inits = dictlearn.make_closures(fsigma) # setting harscale = 0 is a hack to make the gradient w.r.t. spread zero fixed_params = (inst_dict, 0) for i in range(spect.shape[1]): print("Timeslot {}".format(i)) y = spect[:,i] peaks, reconstruction = pursuit.peak_pursuit(y, 1000, 1000, 20, 1, pursuit.inst_shift, pursuit.inst_shift_obj, pursuit.inst_shift_grad, make_bounds, make_inits, fixed_params, pursuit.max_selector, (5,), pexp, qexp) logshifts = np.ones(len(peaks)) * (-np.inf) idcs, = np.where(peaks.shifts > 0) logshifts[idcs] = (numfreqs / np.log(maxfreq / minfreq) * np.log(peaks.shifts[idcs] / minfreq)) peaks.shifts = logshifts peaks.params[0, :] = peaks.params[0, :] * stretch logspect[:, i] = pursuit.inst_shift(peaks, fixed_params, pexp, numfreqs) linspect[:, i] = reconstruction return logspect, linspect
[docs]def example_delta_octaves(): """ Comparison of the different representations with a delta transient and sinusoids. """ n = 200000 signal = np.zeros(n) signal += (np.sin(2*np.pi * 1000/48000 * np.arange(0, n)) + np.sin(2*np.pi * 2000/48000 * np.arange(0, n)) + np.sin(2*np.pi * 4000/48000 * np.arange(0, n)) + np.sin(2*np.pi * 8000/48000 * np.arange(0, n)) + np.sin(2*np.pi * 16000/48000 * np.arange(0, n))) signal[100000] = 2*np.sqrt(2*np.pi)/6*1024 spectheight = 1024*6 spect = spectrogram(signal, spectheight, 6, 128) spectwrite('delta+octaves_lin.png', spect[:spectheight, :], color=False, db=30) spect = logspect_mel(signal, spectheight, 6, 128, 640/48000, 640/48000, 20480/48000, 1024, scale=True)[0] spectwrite('delta+octaves_mel_pi.png', np.sqrt(spect), color=False, db=30) spect = logspect_mel(signal, spectheight, 6, 128, 640/48000, 640/48000, 20480/48000, 1024, scale=False)[0] spectwrite('delta+octaves_mel_fu.png', np.sqrt(spect), color=False, db=30) spect = logspect_cq(signal, spectheight, 6, 128, 640/48000, 640/48000, 20480/48000, 1024, False) spectwrite('delta+octave_cq.png', spect, color=False, db=60) spect = logspect_cq(signal, spectheight, 6, 128, 640/48000, 640/48000, 20480/48000, 1024) spectwrite('delta+octave_cq_smooth.png', spect, color=False, db=30)
[docs]def example_delta_scale(): """ Display of the properties of the smoothed CQT with a delta transient and a chromatic scale of sinusoids. """ n = 160000 timedist = 10000 signal = np.zeros(n) for i in range(13): tone = np.sin(2*np.pi*np.arange(timedist)*880*2**(i/12)/48000) tone[:1000] *= np.hanning(2000)[:1000] tone[9000:] *= np.hanning(2000)[1000:] signal[timedist*(i+2):timedist*(i+3)] = tone signal[10000] = 2*np.sqrt(2*np.pi)/6*1024 spect = logspect_cq(signal, 1024*6, 6, 128, 640/48000, 640/48000, 2560/48000, 900) spectwrite('delta+scale_cq_smooth.png', spect, None, db=40)
[docs]def example_brahms(): """ Application of different transforms on a recording of the 1st violin sonata of Johannes Brahms. """ signal = wav.read('input/brahms1.wav')[0][44100:44100*11] spectheight = 1024*6 spect = logspect_pursuit(signal, spectheight, 6, 256, None, 20/48000, 20480/48000, 1024, 6/np.pi)[0] spectwrite('brahms_pursuit.png', spect, None) basefreq = 1024/np.log(1024)/(12*1024) spect = logspect_mel(signal, spectheight, 6, 256, basefreq, basefreq, 20480/48000, 527)[0] spectwrite('brahms_mel.png', np.sqrt(spect), None) spect = logspect_cq(signal, spectheight, 6, 256, basefreq, 20/48000, 20480/48000, 1024, False) spectwrite('brahms_cq.png', spect, None)
[docs]def example_mozart(): """ Application of the sparse pursuit method on the individual instrument tracks of the piece by Mozart. """ spectheight = 1024*6 basefreq = 1024/np.log(1024)/(12*1024) for (fi,fm,fs) in [('input/mozart/recorder.wav', 'recorder_mel.png', 'recorder_sparse.png'), ('input/mozart/violin.wav', 'violin_mel.png', 'violin_sparse.png'), ('output/mozart/mozart-7-synth0-mask.wav', 'recorder_post_mel.png', 'recorder_post_sparse.png'), ('output/mozart/mozart-7-synth1-mask.wav', 'violin_post_mel.png', 'violin_post_sparse.png'), ('output/mozart/mozart-7-synth-mask.wav', 'mix_post_mel.png', 'mix_post_sparse.png')]: signal = wav.read(fi)[0] spect = logspect_mel(signal, spectheight, 6, 256, basefreq, basefreq, 20480/48000, 527, eval_range=slice(0, 1580))[0] spectwrite(fm, np.sqrt(spect), None) spect = logspect_pursuit(signal, spectheight, 6, 256, None, 20/48000, 20480/48000, 1024, 6/np.pi, eval_range=slice(0, 1580))[0] spectwrite(fs, spect, None)
[docs]def example_beethoven(): signal = wav.read('input/beethoven.wav')[0] spectheight = 1024*6 spect = spectrogram(signal, spectheight, 6, 128) spectwrite('beethoven_lin.png', spect[:spectheight//4, :], color=False, db=50)
if __name__ == '__main__': example_delta_octaves() example_delta_scale() example_brahms() example_mozart() example_beethoven()