basic_pitch/visualize.py
#!/usr/bin/env python
# encoding: utf-8
#
# Copyright 2022 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import tensorflow as tf
import mir_eval
import librosa
from typing import Dict
from basic_pitch.constants import (
AUDIO_SAMPLE_RATE,
ANNOTATIONS_FPS,
ANNOTATIONS_N_SEMITONES,
ANNOTATIONS_BASE_FREQUENCY,
ANNOT_N_FRAMES,
NOTES_BINS_PER_SEMITONE,
AUDIO_N_SAMPLES,
)
from basic_pitch import models

SONIFY_FS = 3000
MAX_OUTPUTS = 4
FREQS = librosa.core.cqt_frequencies(
n_bins=ANNOTATIONS_N_SEMITONES * NOTES_BINS_PER_SEMITONE,
fmin=ANNOTATIONS_BASE_FREQUENCY,
bins_per_octave=12 * NOTES_BINS_PER_SEMITONE,
)

# sonification is slow - for speed, only sonify frequencies below the Nyquist
# frequency, SONIFY_FS / 2 (i.e. 1500 Hz for SONIFY_FS = 3000)
MAX_FREQ_INDEX = np.where(FREQS > SONIFY_FS / 2)[0][0]
TIMES = librosa.core.frames_to_time(
np.arange(ANNOT_N_FRAMES),
sr=AUDIO_SAMPLE_RATE,
hop_length=AUDIO_SAMPLE_RATE // ANNOTATIONS_FPS,
)
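

# Illustrative sketch, not part of the original module: a quick look at the
# sonification grid defined above. FREQS[MAX_FREQ_INDEX - 1] is the highest
# frequency that will actually be sonified (bins above SONIFY_FS / 2 are
# dropped for speed), and TIMES holds the start time of every annotation frame.
def _example_describe_sonification_grid() -> None:
    print("highest sonified frequency (Hz):", FREQS[MAX_FREQ_INDEX - 1])
    print("number of annotation frames:", len(TIMES))
    print("last frame start time (s):", TIMES[-1])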


def get_input_model() -> tf.keras.Model:
    """Define a model that generates the CQT (Constant-Q Transform) of input audio."""
inputs = tf.keras.Input(shape=(AUDIO_N_SAMPLES, 1)) # (batch, time, ch)
x = models.get_cqt(inputs, 1, False)
model = tf.keras.Model(inputs=inputs, outputs=x)
model.compile()
return model


INPUT_MODEL = get_input_model()
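

# Illustrative sketch, not part of the original module: INPUT_MODEL is only used
# for visualization. Feeding a zero batch shaped (batch, AUDIO_N_SAMPLES, 1)
# through it produces the time-frequency image that _audio_input() later
# transposes for tf.summary.image; the exact output shape depends on
# models.get_cqt, so it is only printed here rather than asserted.
def _example_cqt_shape() -> None:
    dummy_audio = tf.zeros((1, AUDIO_N_SAMPLES, 1), dtype=tf.float32)
    cqt = INPUT_MODEL(dummy_audio)
    print("CQT output shape:", cqt.shape)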


def visualize_transcription(
file_writer: tf.summary.SummaryWriter,
stage: str,
inputs: tf.Tensor,
targets: Dict[str, tf.Tensor],
outputs: Dict[str, tf.Tensor],
loss: float,
step: int,
sonify: bool = True,
contours: bool = True,
) -> None:
"""Create tf.summaries of transcription outputs to be plotted in tensorboard

    Args:
        file_writer: TensorBoard summary file writer
        stage: "train" or "validation"
        inputs: batch of input audio, shaped (batch, AUDIO_N_SAMPLES, 1)
        targets: batch of target data (dictionary keyed by "onset", "contour" and "note")
        outputs: batch of model outputs (same keys as targets)
        loss: loss value for the epoch
        step: the epoch number
        sonify: whether to sonify the onset and note outputs
        contours: whether to plot the contour targets / outputs
    """
with file_writer.as_default():
# create audio player
tf.summary.audio(
f"{stage}/audio/inputs",
inputs,
sample_rate=AUDIO_SAMPLE_RATE,
step=step,
max_outputs=MAX_OUTPUTS,
)
        # plot the CQT of the input audio
tf.summary.image(
f"{stage}/audio/input",
_audio_input(inputs),
step=step,
max_outputs=MAX_OUTPUTS,
)
# plot onsets
tf.summary.image(
f"{stage}/images/onsets/target",
_array_to_image(targets["onset"]),
step=step,
max_outputs=MAX_OUTPUTS,
)
tf.summary.image(
f"{stage}/images/onsets/output",
_array_to_image(outputs["onset"]),
step=step,
max_outputs=MAX_OUTPUTS,
)
if sonify:
tf.summary.audio(
f"{stage}/audio/onsets-output",
_array_to_sonification(outputs["onset"], MAX_OUTPUTS),
sample_rate=SONIFY_FS,
step=step,
max_outputs=MAX_OUTPUTS,
)
if contours:
# plot contours
tf.summary.image(
f"{stage}/images/contours/target",
_array_to_image(targets["contour"]),
step=step,
max_outputs=MAX_OUTPUTS,
)
tf.summary.image(
f"{stage}/images/contours/output",
_array_to_image(outputs["contour"]),
step=step,
max_outputs=MAX_OUTPUTS,
)
# plot notes
tf.summary.image(
f"{stage}/images/notes/target",
_array_to_image(targets["note"]),
step=step,
max_outputs=MAX_OUTPUTS,
)
tf.summary.image(
f"{stage}/images/notes/output",
_array_to_image(outputs["note"]),
step=step,
max_outputs=MAX_OUTPUTS,
)
if sonify:
# sonify notes
tf.summary.audio(
f"{stage}/audio/notes-output",
_array_to_sonification(outputs["note"], MAX_OUTPUTS),
sample_rate=SONIFY_FS,
step=step,
max_outputs=MAX_OUTPUTS,
)
# plot loss
tf.summary.scalar(f"{stage}/loss", loss, step=step)
# plot max
if contours:
tf.summary.scalar(f"{stage}/contour-max", np.max(outputs["contour"]), step=step)
tf.summary.scalar(f"{stage}/note-max", np.max(outputs["note"]), step=step)
tf.summary.scalar(f"{stage}/onset-max", np.max(outputs["onset"]), step=step)
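

# Illustrative usage sketch, not part of the original module: the log directory,
# batch size, and random tensors below are assumptions for demonstration only.
# targets / outputs must map "onset", "contour" and "note" to
# (batch, time, frequency) arrays (a single frequency grid is used here for
# simplicity); sonify=False keeps the sketch fast by skipping the slow
# mir_eval sonification.
def _example_visualize_transcription() -> None:
    writer = tf.summary.create_file_writer("/tmp/basic_pitch_tensorboard")  # assumed log directory
    n_freqs = ANNOTATIONS_N_SEMITONES * NOTES_BINS_PER_SEMITONE
    audio = tf.random.uniform((2, AUDIO_N_SAMPLES, 1), minval=-1.0, maxval=1.0)
    fake = {
        key: tf.random.uniform((2, ANNOT_N_FRAMES, n_freqs))
        for key in ("onset", "contour", "note")
    }
    visualize_transcription(writer, "validation", audio, fake, fake, loss=0.0, step=0, sonify=False)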


def _array_to_sonification(array: tf.Tensor, max_outputs: int, clip: float = 0.3) -> tf.Tensor:
    """Sonify a time-frequency representation of audio.

    Args:
        array: time-frequency representation of audio, shaped (batch, time, frequency)
        max_outputs: the maximum number of items from the batch to sonify
        clip: threshold below which time-frequency values are zeroed out before sonification

    Returns:
        a (min(batch, max_outputs), n_samples, 1) tensor of audio sampled at SONIFY_FS
    """
gram_batch = tf.transpose(array, perm=[0, 2, 1]).numpy()
audio_list = []
for i, gram in enumerate(gram_batch):
gram[gram < clip] = 0.0
y = mir_eval.sonify.time_frequency(
gram[:MAX_FREQ_INDEX, :],
FREQS[:MAX_FREQ_INDEX],
TIMES,
fs=SONIFY_FS,
)
audio_list.append(y[:, np.newaxis])
if i + 1 >= max_outputs:
break
return tf.convert_to_tensor(np.array(audio_list), dtype=tf.float32)
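

# Minimal sketch, not part of the original module: sonify a salience array with a
# single active frequency bin held for every frame. Bin 40 is an arbitrary choice
# below MAX_FREQ_INDEX; the result is a (1, n_samples, 1) tensor of audio at SONIFY_FS.
def _example_sonify_single_pitch() -> tf.Tensor:
    n_freqs = ANNOTATIONS_N_SEMITONES * NOTES_BINS_PER_SEMITONE
    salience = np.zeros((1, ANNOT_N_FRAMES, n_freqs), dtype=np.float32)
    salience[0, :, 40] = 1.0  # one sustained pitch, well below the SONIFY_FS / 2 cutoff
    return _array_to_sonification(tf.convert_to_tensor(salience), max_outputs=1)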


def _audio_input(audio: tf.Tensor) -> tf.Tensor:
    """Get the Constant-Q transform of the input audio using the input model defined above.

    Args:
        audio: the audio signal to process

    Returns:
        constant-Q transform of the audio (3 bins per semitone, ~11 ms hop size)
    """
audio_in = INPUT_MODEL(audio)
return tf.transpose(audio_in, perm=[0, 2, 1, 3])


def _array_to_image(array: tf.Tensor) -> tf.Tensor:
    """Convert a time-frequency array of shape (batch, time, frequency) to
    the shape expected by tf.summary.image: (batch, frequency, time, 1).

    Args:
        array: a (batch, time, frequency) array

    Returns:
        reshaped array
    """
return tf.expand_dims(tf.transpose(array, perm=[0, 2, 1]), 3)
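

# Minimal shape sketch, not part of the original module: a (batch, time, frequency)
# salience array becomes a (batch, frequency, time, 1) image batch, so frequency ends
# up on the vertical axis of the TensorBoard image. The sizes below are arbitrary.
def _example_array_to_image_shape() -> None:
    salience = tf.zeros((2, ANNOT_N_FRAMES, 88))
    image = _array_to_image(salience)
    assert image.shape == (2, 88, ANNOT_N_FRAMES, 1)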