audio/src/klio_audio/transforms/audio.py (106 lines of code) (raw):

# Copyright 2020 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import librosa
import librosa.display
import numpy as np

from klio.transforms import decorators as tfm_decorators

from klio_audio import decorators
from klio_audio.transforms import _base


class LoadAudio(_base.KlioAudioBaseDoFn):
    """Load audio into memory as a :class:`numpy.ndarray`.

    This transform wraps :func:`librosa.load` and takes in a
    :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` with the payload of the
    ``KlioMessage`` a file-like object or a path to a file, and returns
    a ``PCollection`` of ``KlioMessages`` where the payload is a
    :class:`numpy.ndarray`.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                # other transforms
            )

    Args:
        librosa_kwargs (dict): Instantiate the transform with keyword
            arguments to pass into :func:`librosa.load`.
    """

    def __init__(self, *_, **librosa_kwargs):
        # Positional arguments are accepted but ignored.
        self.librosa_kwargs = librosa_kwargs

    @tfm_decorators._handle_klio
    @decorators.handle_binary(save_with_numpy=True)
    def process(self, item):
        """Yield the audio time series loaded from ``item.payload``."""
        element = item.element.decode("utf-8")
        self._klio.logger.debug(
            "Loading {} into memory as a numpy array.".format(element)
        )
        # librosa.load returns ``(audio, sample_rate)``; only the audio
        # array is forwarded downstream.
        audio, _ = librosa.load(item.payload, **self.librosa_kwargs)
        yield audio


class GetSTFT(_base.KlioAudioBaseDoFn):
    """Calculate Short-time Fourier transform from a
    :class:`numpy.ndarray`.

    This transform wraps :func:`librosa.stft` and expects a
    :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` where the payload is a
    :class:`numpy.ndarray` and the output is the same with the ``stft``
    calculation applied.

    The Short-time Fourier transform (STFT) is a Fourier-related
    transform used to determine the sinusoidal frequency and phase
    content of local sections of a signal as it changes over time.
    STFT provides the time-localized frequency information for
    situations in which frequency components of a signal vary over
    time, whereas the standard Fourier transform provides the frequency
    information averaged over the entire signal time interval.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                | audio.GetSTFT()
                # other transforms
            )

    Args:
        librosa_kwargs (dict): Instantiate the transform with keyword
            arguments to pass into :func:`librosa.stft`.
    """

    def __init__(self, *_, **librosa_kwargs):
        # Positional arguments are accepted but ignored.
        self.librosa_kwargs = librosa_kwargs

    @tfm_decorators._handle_klio
    @decorators.handle_binary(load_with_numpy=True, save_with_numpy=True)
    def process(self, item):
        """Yield ``librosa.stft`` applied to ``item.payload``."""
        element = item.element.decode("utf-8")
        self._klio.logger.debug(
            "Calculating the short-time Fourier transform for {}".format(
                element
            )
        )
        yield librosa.stft(y=item.payload, **self.librosa_kwargs)


class GetSpec(_base.KlioAudioBaseDoFn):
    """Generate a dB-scaled spectrogram from a :class:`numpy.ndarray`.

    This transform wraps :func:`librosa.amplitude_to_db` and expects a
    :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` where the payload is a
    :class:`numpy.ndarray` and the output is the same with the
    ``amplitude_to_db`` function applied.

    The payload is treated as an STFT: its magnitude is taken and the
    result is scaled relative to the maximum magnitude.

    A spectrogram shows the intensity of frequencies over time.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                | audio.GetSTFT()
                | audio.GetSpec()
                # other transforms
            )

    Args:
        librosa_kwargs (dict): Instantiate the transform with keyword
            arguments to pass into :func:`librosa.amplitude_to_db`.
            ``ref`` is always supplied by this transform, so it must
            not be included here.
    """

    def __init__(self, *_, **librosa_kwargs):
        # Positional arguments are accepted but ignored.
        self.librosa_kwargs = librosa_kwargs

    @tfm_decorators._handle_klio
    @decorators.handle_binary(load_with_numpy=True, save_with_numpy=True)
    def process(self, item):
        """Yield the dB-scaled magnitude of ``item.payload``."""
        element = item.element.decode("utf-8")
        self._klio.logger.debug(
            "Generating a spectrogram for {}".format(element)
        )
        # Take the magnitude once and reuse it for both the input and
        # the reference value instead of recomputing it.
        magnitude = np.abs(item.payload)
        yield librosa.amplitude_to_db(
            magnitude, ref=np.max(magnitude), **self.librosa_kwargs
        )


class GetMelSpec(_base.KlioAudioBaseDoFn):
    """Generate a spectrogram from a :class:`numpy.ndarray` using the
    mel scale.

    This transform wraps :func:`librosa.feature.melspectrogram` and
    expects a :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` where the payload is a
    :class:`numpy.ndarray` and the output is the same with the
    ``melspectrogram`` function applied.

    The mel scale is a non-linear transformation of frequency scale
    based on the perception of pitches. The mel scale is calculated so
    that two pairs of frequencies separated by a delta in the mel scale
    are perceived by humans as being equidistant.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                | audio.GetMelSpec()
                # other transforms
            )

    Args:
        librosa_kwargs (dict): Instantiate the transform with keyword
            arguments to pass into
            :func:`librosa.feature.melspectrogram`.
    """

    def __init__(self, *_, **librosa_kwargs):
        # Positional arguments are accepted but ignored.
        self.librosa_kwargs = librosa_kwargs

    @tfm_decorators._handle_klio
    @decorators.handle_binary(load_with_numpy=True, save_with_numpy=True)
    def process(self, item):
        """Yield the mel spectrogram computed from ``item.payload``."""
        element = item.element.decode("utf-8")
        self._klio.logger.debug(
            "Generating a Mel spectrogram for {}".format(element)
        )
        yield librosa.feature.melspectrogram(
            y=item.payload, **self.librosa_kwargs
        )


class GetMFCC(_base.KlioAudioBaseDoFn):
    """Calculate MFCCs from a :class:`numpy.ndarray`.

    This transform wraps :func:`librosa.power_to_db` followed by
    :func:`librosa.feature.mfcc` and expects a
    :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` where the payload is a
    :class:`numpy.ndarray` and the output is the same with the ``mfcc``
    function applied.

    The payload is treated as a power mel spectrogram (for example the
    output of :class:`GetMelSpec`): it is converted to decibels before
    being handed to ``mfcc`` as ``S``.

    The Mel frequency cepstral coefficients (MFCCs) of a signal are a
    small set of features (usually about 10–20) which describe the
    overall shape of a spectral envelope. It is often used to describe
    timbre or model characteristics of human voice.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                | audio.GetMelSpec()
                | audio.GetMFCC()
                # other transforms
            )

    Args:
        librosa_kwargs (dict): Instantiate the transform with keyword
            arguments to pass into :func:`librosa.feature.mfcc`.
    """

    def __init__(self, *_, **librosa_kwargs):
        # Positional arguments are accepted but ignored.
        self.librosa_kwargs = librosa_kwargs

    @tfm_decorators._handle_klio
    @decorators.handle_binary(load_with_numpy=True, save_with_numpy=True)
    def process(self, item):
        """Yield the MFCCs computed from ``item.payload``."""
        element = item.element.decode("utf-8")
        self._klio.logger.debug(
            "Generating Mel frequency cepstral coefficients for {}".format(
                element
            )
        )
        # melspectrogram by default returns a power'ed (**2) spectrogram
        # so we need to convert to decibel units (if it wasn't a power'ed
        # spec, then we'd use amplitude_to_db)
        Sdb = librosa.power_to_db(item.payload, ref=np.max)
        yield librosa.feature.mfcc(S=Sdb, **self.librosa_kwargs)


class SpecToPlot(_base.KlioPlotBaseDoFn):
    """Generate a matplotlib figure of the spectrogram of a
    :class:`numpy.ndarray`.

    This transform wraps :func:`librosa.display.specshow` and expects a
    :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` where the payload is a
    :class:`numpy.ndarray` of a spectrogram and the output is a
    :class:`matplotlib.figure.Figure` instance.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                | audio.GetSTFT()
                | audio.GetSpec()
                | audio.SpecToPlot()
                # other transforms
            )

    Args:
        title (str): Title of spectrogram plot. Default:
            ``Spectrogram of {KlioMessage.data.element}``.
        plot_args (dict): keyword arguments to pass to
            :func:`librosa.display.specshow`. ``x_axis`` defaults to
            ``"time"`` and ``y_axis`` to ``"linear"`` when not given.
    """

    DEFAULT_TITLE = "Spectrogram of {element}"

    def __init__(self, *_, title=None, **plot_args):
        # NOTE(review): ``self`` is also forwarded as an extra positional
        # argument; left untouched because the base initializer is not
        # visible here -- confirm the base ignores positionals.
        super(SpecToPlot, self).__init__(self, title=title, **plot_args)
        # Fill in axis defaults without overriding caller-supplied values.
        self.plot_args["x_axis"] = self.plot_args.get("x_axis", "time")
        self.plot_args["y_axis"] = self.plot_args.get("y_axis", "linear")

    def _plot(self, item, fig):
        """Draw ``item.payload`` onto the current axes of ``fig``."""
        librosa.display.specshow(item.payload, ax=fig.gca(), **self.plot_args)


class MelSpecToPlot(_base.KlioPlotBaseDoFn):
    """Generate a matplotlib figure of the mel spectrogram of a
    :class:`numpy.ndarray`.

    This transform wraps :func:`librosa.power_to_db` followed by
    :func:`librosa.display.specshow` and expects a
    :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` where the payload is a
    :class:`numpy.ndarray` of a melspectrogram and the output is a
    :class:`matplotlib.figure.Figure` instance.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                | audio.GetMelSpec()
                | audio.MelSpecToPlot()
                # other transforms
            )

    Args:
        title (str): Title of spectrogram plot. Default:
            ``Mel-frequency Spectrogram of {KlioMessage.data.element}``.
        plot_args (dict): keyword arguments to pass to
            :func:`librosa.display.specshow`. ``y_axis`` is always set
            to ``"mel"``; ``x_axis`` defaults to ``"time"`` and
            ``fmax`` to ``8000`` when not given.
    """

    DEFAULT_TITLE = "Mel-frequency Spectrogram of {element}"

    def __init__(self, *_, title=None, **plot_args):
        # NOTE(review): ``self`` is also forwarded as an extra positional
        # argument; left untouched because the base initializer is not
        # visible here -- confirm the base ignores positionals.
        super(MelSpecToPlot, self).__init__(self, title=title, **plot_args)
        # The y axis is forced to the mel scale regardless of what the
        # caller passed; the other two are only defaults.
        self.plot_args["y_axis"] = "mel"
        self.plot_args["x_axis"] = self.plot_args.get("x_axis", "time")
        self.plot_args["fmax"] = self.plot_args.get("fmax", 8000)

    def _plot(self, item, fig):
        """Draw the dB-scaled ``item.payload`` onto ``fig``'s axes."""
        Sdb = librosa.power_to_db(item.payload, ref=np.max)
        librosa.display.specshow(Sdb, ax=fig.gca(), **self.plot_args)


class MFCCToPlot(_base.KlioPlotBaseDoFn):
    """Generate a matplotlib figure of the MFCCs as a
    :class:`numpy.ndarray`.

    This transform wraps :func:`librosa.display.specshow` and expects a
    :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` where the payload is a
    :class:`numpy.ndarray` of the MFCCs of an audio and the output is a
    :class:`matplotlib.figure.Figure` instance.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                | audio.GetMelSpec()
                | audio.GetMFCC()
                | audio.MFCCToPlot()
                # other transforms
            )

    Args:
        title (str): Title of spectrogram plot. Default:
            ``MFCCs of {KlioMessage.data.element}``.
        plot_args (dict): keyword arguments to pass to
            :func:`librosa.display.specshow`. ``x_axis`` defaults to
            ``"time"`` when not given.
    """

    DEFAULT_TITLE = "MFCCs of {element}"

    def __init__(self, *_, title=None, **plot_args):
        # NOTE(review): ``self`` is also forwarded as an extra positional
        # argument; left untouched because the base initializer is not
        # visible here -- confirm the base ignores positionals.
        super(MFCCToPlot, self).__init__(self, title=title, **plot_args)
        self.plot_args["x_axis"] = self.plot_args.get("x_axis", "time")

    def _plot(self, item, fig):
        """Draw ``item.payload`` onto the current axes of ``fig``."""
        librosa.display.specshow(item.payload, ax=fig.gca(), **self.plot_args)


# Derives from the plot base, like the sibling ``*ToPlot`` transforms: it
# forwards ``title``/``plot_args`` to the base initializer, reads
# ``self.plot_args`` and only implements ``_plot``.
class WaveformToPlot(_base.KlioPlotBaseDoFn):
    """Generate a matplotlib figure of the wave form of a
    :class:`numpy.ndarray`.

    This transform wraps :func:`librosa.display.waveplot` and expects a
    :class:`PCollection <apache_beam.pvalue.PCollection>` of
    :ref:`KlioMessages <klio-message>` where the payload is a
    :class:`numpy.ndarray` of a loaded audio file and the output is a
    :class:`matplotlib.figure.Figure` instance.

    Example:

    .. code-block:: python

        # run.py
        import apache_beam as beam
        from klio.transforms import decorators
        from klio_audio.transforms import audio

        @decorators.handle_klio
        def element_to_filename(ctx, data):
            filename = data.element.decode("utf-8")
            return f"file:///path/to/audio/{filename}.wav"

        def run(in_pcol, job_config):
            return (
                in_pcol
                | beam.Map(element_to_filename)
                | audio.LoadAudio()
                | audio.WaveformToPlot()
                # other transforms
            )

    Args:
        num_samples (int): Number of samples to plot. Default: ``5000``.
        title (str): Title of spectrogram plot. Default:
            ``Waveplot of {KlioMessage.data.element}``.
        plot_args (dict): keyword arguments to pass to
            :func:`librosa.display.waveplot`.
    """

    DEFAULT_TITLE = "Waveplot of {element}"

    def __init__(self, *_, num_samples=5000, title=None, **plot_args):
        # NOTE(review): ``self`` is also forwarded as an extra positional
        # argument; left untouched because the base initializer is not
        # visible here -- confirm the base ignores positionals.
        super(WaveformToPlot, self).__init__(self, title=title, **plot_args)
        self.num_samples = num_samples

    def _plot(self, item, fig):
        """Draw the first ``num_samples`` samples of ``item.payload``."""
        # ``waveplot`` is used whenever the installed librosa provides it.
        # Newer librosa releases dropped it in favour of ``waveshow``, so
        # fall back to that instead of failing with AttributeError.
        draw_waveform = getattr(librosa.display, "waveplot", None)
        if draw_waveform is None:
            draw_waveform = librosa.display.waveshow
        draw_waveform(
            item.payload[: self.num_samples], ax=fig.gca(), **self.plot_args
        )