in basic_pitch/models.py [0:0]
def get_cqt(inputs: tf.Tensor, n_harmonics: int, use_batchnorm: bool) -> tf.Tensor:
    """Compute the log-normalized CQT of the input audio.

    Input shape: (batch, number of audio samples, 1)
    Output shape: (batch, number of frequency bins, number of time frames)

    Args:
        inputs: The audio input.
        n_harmonics: The number of harmonics to capture above the maximum
            output frequency; determines how many semitones the CQT spans.
        use_batchnorm: If True, applies batch normalization after computing
            the CQT.

    Returns:
        The log-normalized CQT of the input audio.
    """
    # Extra semitones are needed so the highest harmonic is still
    # representable; the total is capped at MAX_N_SEMITONES.
    harmonic_span = int(np.ceil(12.0 * np.log2(n_harmonics)) + ANNOTATIONS_N_SEMITONES)
    n_semitones = np.min([harmonic_span, MAX_N_SEMITONES])

    audio = nn.FlattenAudioCh()(inputs)
    cqt = nnaudio.CQT(
        sr=AUDIO_SAMPLE_RATE,
        hop_length=FFT_HOP,
        fmin=ANNOTATIONS_BASE_FREQUENCY,
        n_bins=n_semitones * CONTOURS_BINS_PER_SEMITONE,
        bins_per_octave=12 * CONTOURS_BINS_PER_SEMITONE,
    )(audio)
    cqt = signal.NormalizedLog()(cqt)
    cqt = tf.expand_dims(cqt, -1)
    return tfkl.BatchNormalization()(cqt) if use_batchnorm else cqt