% Untitled

% Animated narrow-band spectrum of a whistle recording.
% Reads the audio file, slides a 100 ms analysis frame across it in hops of
% 100 samples, and for every frame plots (top) the raw samples and (bottom)
% the DFT magnitude evaluated only at a bank of musically spaced
% frequencies.

[y, fs] = audioread('test_whistle.m4a');

% audioread returns one column per channel. Analyse the first channel only
% and keep it as a column vector, so the code behaves the same for mono and
% multi-channel recordings (previously a mono file caused a dimension
% mismatch in the matrix product below).
y = y(:, 1);
num_samples = size(y, 1);

% Samples per analysis frame (100 ms). floor() keeps this an integer even
% when fs is not a multiple of 10.
frame_len = floor(fs / 10);

foi = 440 * 2^(15/12) * 2.^((0:79)' / 48); % Frequencies of interest (Hz)
% This uses the logarithmic relation of pitch frequencies in music.
% 440 * 2^(15/12) is the C fifteen semitones above A440 ("high C",
% ~1046.5 Hz). From there, 80 frequencies are spaced 1/48 of an octave
% apart (a quarter of a semitone), ending at ~3275 Hz.

cmat = exp(-2*pi*1i * (1/fs) * (foi * (0:frame_len-1)));
% Generate look-up table for certain frequencies in the DFT.
% This is akin to doing the DTFT sum for only certain frequencies.
% Row r holds e^(-j*2*pi*foi(r)*n/fs) for n = 0, 1, ..., frame_len - 1, so
% cmat is numel(foi)-by-frame_len and cmat * frame gives one complex
% coefficient per frequency of interest.

for k = 1:100:(num_samples - frame_len - 1)
    idx = (k + 1):(k + frame_len); % Sample indices of the current frame
    frame = y(idx);                % frame_len-by-1 column of audio samples

    subplot(2,1,1);
    plot(frame);   % Plot the audio data
    ylim([-1,1]);  % Fixed vertical scale so the plot does not rescale per frame

    % frame = frame .* hann(frame_len); % Use a Hanning window?

    subplot(2,1,2);
    spectrum = abs(cmat * frame); % Magnitude at each frequency of interest.
    % Although the matrix multiplication approach to calculating the DFT
    % is n^2 complexity, only numel(foi) frequencies are analysed here
    % rather than a full frame_len-point FFT. (Original author's note:
    % the FFT still turned out to be faster than this matrix approach.)

    stem(foi, spectrum, 'r'); % Plot the partial DFT
    %hold on % For a sanity check, we'll overlap MATLAB's fft readout
    %plot(abs(fft(frame,fs))); % Plot the fft for comparison
    xlim([1000,3500]);
    ylim([0,1000]); % Fixed vertical scale for the spectrum

    hold off
    title(sprintf('The time now is %04.2g s', k / fs));
    pause(0.0005); % Short pause so the figure is redrawn each iteration
end

% Some problems:
% I have a really crude technique here for filtering out noise. What I do
% is simply set anything below a certain level to 0. This was done after
% I looked at the recording myself, and determined an appropriate level.
% Unfortunately, a different level is required for each recording it
% seems.

% When the person whistling doesn't whistle exactly on pitch, the detected
% note is spurious. Furthermore, if their volume modulates, sometimes you
% get a kind of "switch bounce" artefact (clearly seen with test_whistle3,
% at around 2 seconds). We need a robust method for tolerating these kinds
% of imperfections, so that the machine doesn't play a bunch of wrong notes
% on the xylophone.