Untitled

#include <vector>
#include <string>
#include <cmath>
#include <iostream>
#include <sndfile.h>

#ifdef WITH_RLIMIT
#include <sys/resource.h>
#endif


//// Common types ////

typedef float sample_type;
typedef std::vector < sample_type > samples;


//// Math functions ////

static float const pi = 3.1415926535f;

unsigned int gcd(unsigned int p_first, unsigned int p_second)
{
	if (p_first == 0) return p_second;
	if (p_second == 0) return p_first;

	do
	{
		unsigned int temp = p_first % p_second;
		p_first = p_second;
		p_second = temp;
	}
	while (p_second != 0);

	return p_first;
}

unsigned int lcm(unsigned int p_first, unsigned int p_second)
{
	unsigned int temp = gcd(p_first, p_second);
	return (temp != 0) ? (p_first * p_second / temp) : 0;
}

float sinc(float f)
{
	return (f == 0.0f) ? 1.0f : (std::sin(f * pi) / (f * pi));
}

float hanning_func(int n, int N)
{
	return 0.5f * (1.0f - std::cos(2.0f * pi * float(n) / float(N - 1)));
}

samples compute_hanning_window(std::size_t const p_window_size)
{
	samples window(p_window_size, 0);
	for (std::size_t i = 0; i < p_window_size; ++i)
		window[i] = hanning_func(i, p_window_size);
	return window;
}

sample_type convolve(sample_type const *p_first_samples, sample_type const *p_second_samples, std::size_t const p_num_samples)
{
	sample_type result = 0.0f;
	int num_samples = int(p_num_samples);

	for (int first_index = 0; first_index < num_samples; ++first_index)
	{
		int second_index = num_samples - 1 - first_index;

		sample_type first_sample = p_first_samples[first_index];
		sample_type second_sample = p_second_samples[second_index];

		result += first_sample * second_sample;
	}

	return result;
}

samples compute_polyphase_filter_bank(std::size_t const p_upsampling_factor, std::size_t const p_downsampling_factor, std::size_t const p_filter_size)
{
	// Create the window function that will be applied on top of the
	// stretched sinc. The window size equals the filter size plus the
	// upsampling factor, since we need to account for the extra nullsamples that
	// get inserted during zerostuffing.
	samples window = compute_hanning_window(p_filter_size * p_upsampling_factor);

	samples filter(window.size(), 0);

	// We need to create a low pass filter that cuts off frequencies at half of
	// either the input or the output sample rate, whichever is lower. However,
	// it turns out that for the filter calculation, we don't really need the
	// sample rates directly. Instead, we only need to "stretch" sinc by a
	// specific factor. A factor of 1 means no change. A factor of 2 means that
	// half of the original frequency range is cut off etc.
	//
	// If we only want to upsample by a factor of 2, and don't want to downsample,
	// then it is sufficient to stretch the sinc by a factor of 2. This is because
	// when we upsample, we apply zero stuffing, which creates unwantd spectral
	// images above the desired range. So, if for example we upsample by a factor
	// of 2, this means we insert 2-1 = 1 nullsample in between the original
	// samples (this is the zero-stuffing part). Like this:
	//
	// a b c d e f ... -> a 0 b 0 c 0 d 0 e 0 f 0 ...
	//
	// The lower 50% of the spectrum contains the original spectral image. So, we
	// need to get rid of anything except the lower half in the zero-stuffed signal.
	//
	// Another example: Upsampling by a factor of 3, no downsampling. Here,
	// we have 2 unwanted spectral images above the original one, and 3-1 = 2
	// nullsamples were stuffed in between the original samples. Like this:
	//
	// a b c d ... -> a 0 0 b 0 0 c 0 0 d 0 0 ...
	//
	// Since we only want the original image, and its spectrum only makes 1/3rd
	// of the spectrum of the zerostuffed signal, we stretch sinc by a factor of
	// 3, which causes it to lowpass-filter anything except the lowest 33,3% of
	// the spectrum.
	//
	// If we also downsample, then it depends by how much. If the downsampling
	// factor is higher than the upsampling, it means that the downsampled
	// signal will have a Nyquist frequency that is *lower* than that of the
	// original signal. Example: converting from 24000 Hz to 22050 Hz. Here,
	// we need to lowpass-filter to make sure we get rid of all frequencies
	// above 22050/2 = 11025 Hz, otherwise aliasing will occur in the downsampled
	// signal.
	//
	// If instead we downsample to a rate that is *higher* than the original
	// one, we lowpass-filter with the original signal's Nyquist frequency.
	// If we convert from 24000 Hz to 44100 Hz, we lowpass-filter to get rid
	// of all frequencies above 24000 / 2 = 12000 Hz.
	//
	// Again, the actual sample rate does not matter, only the factors do.
	// We simply pick the higher of the two (up/downsampling factors). This
	// corresponds to picking the lower of the two sample rates (input/output
	// sample rates).
	float scale_factor = float(std::max(p_downsampling_factor, p_upsampling_factor));

	// The polyphase filter bank is stored as a one-dimensional array.
	// The polyphase filters are arranged in the array as shown in
	// this example:
	//
	//   AAABBBCCCDDD
	//
	// Where A,B,C,D are the coefficients of polyphase filters A,B,C,D.
	// Upsampling factor is 4 (that's why there are four filters). Each
	// filter has 3 taps in this example.

	// NOTE: It would be useful to reverse the filters, that is,
	// coefficients 1234 -> 4321, to make convolution easier, because
	// then it can be done using a simple multiply-and-add operation.
	// (Convolution does multiply-and-add, but with one of the inputs
	// reversed, so by pre-reversing one, it devolves into a simple
	// multiply-and-add.)

	std::size_t num_filters = p_upsampling_factor;
	std::size_t num_filter_taps = window.size() / num_filters;

	for (std::size_t i = 0; i < window.size(); ++i)
	{
		// Note that we do _not_ scale t by the window size here. So, we do not
		// normalize it in any way. This is intentional: The filter size defines
		// the _quality_ of the filtering. Larger filter means more sinc coefficients,
		// or in other words, we tap a larger range of sinc. If we normalized t,
		// we would always tap the _same_ range of the sinc function (the area
		// around the main lobe).
		// We do however offset t to make sure it ranges from -w/2 to w/2-1 instead of
		// 0 to w-1 . Otherwise we don't get a symmetric sinc tap.
		// TODO: Should it be from -w to +w instead?
		float t = int(i) - int(window.size() / 2);

		// Compute stretched sinc.
		float f = sinc(t / scale_factor);

		std::size_t polyphase_filter_index = i % num_filters;
		std::size_t offset_in_polyphase_filter = i / num_filters;
		std::size_t filter_array_idx = polyphase_filter_index * num_filter_taps + offset_in_polyphase_filter;

		// Apply the window function on top of the sinc
		// function to produce filter coefficients.
		filter[filter_array_idx] = f * window[i];
	}

	// TODO: It is unclear why this is necessary, but without this,
	// the output signal may be incorrectly amplified.
	if (p_downsampling_factor > p_upsampling_factor)
	{
		float dampening_factor = float(p_upsampling_factor) / p_downsampling_factor;
		for (auto & filter_coefficient : filter)
			filter_coefficient *= dampening_factor;
	}

	return filter;
}


//// Sound I/O functionality ////

class sound_file
{
public:
	sound_file()
		: m_sndfile(nullptr)
		, m_num_samples(0)
		, m_sample_rate(0)
	{
	}

	bool open_for_reading(std::string const &p_filename)
	{
		if (m_sndfile != nullptr)
		{
			std::cerr << "A file has already been opened\n";
			return false;
		}

		SF_INFO info;
		info.format = 0;

		if (!open_internal(p_filename, true, info))
			return false;

		if (info.channels != 1)
		{
			sf_close(m_sndfile);
			m_sndfile = nullptr;
			std::cerr << "File \"" << p_filename << "\" has " << info.channels << " channels; only mono is supported" << std::endl;
			return false;
		}

		// Technically, this is wrong, since 1 frame is a collection of N
		// samples, with N being the number of channels. But, since we
		// anyway only support mono in this example, it does not matter.
		m_num_samples = info.frames;
		m_sample_rate = info.samplerate;

		return true;
	}

	bool open_for_writing(std::string const &p_filename, unsigned int const p_sample_rate)
	{
		if (m_sndfile != nullptr)
		{
			std::cerr << "A file has already been opened\n";
			return false;
		}

		SF_INFO info;
		info.format = SF_FORMAT_WAV | SF_FORMAT_FLOAT;
		info.samplerate = p_sample_rate;
		info.channels = 1;

		if (!open_internal(p_filename, false, info))
			return false;

		m_num_samples = 0;
		m_sample_rate = p_sample_rate;

		return true;
	}

	~sound_file()
	{
		if (m_sndfile != nullptr)
			sf_close(m_sndfile);
	}

	std::size_t get_total_num_input_samples() const
	{
		return m_num_samples;
	}

	unsigned int get_sample_rate() const
	{
		return m_sample_rate;
	}

	std::size_t read_all_samples(samples &p_samples)
	{
		return read_samples(p_samples, m_num_samples);
	}

	std::size_t read_samples(samples &p_samples, std::size_t const p_num_samples_to_read)
	{
		if (p_num_samples_to_read > p_samples.size())
			p_samples.resize(p_num_samples_to_read);

		return sf_read_float(m_sndfile, &p_samples[0], p_num_samples_to_read);
	}

	void write_samples(samples const &p_samples, std::size_t p_num_samples_to_write)
	{
		if (p_num_samples_to_write > p_samples.size())
			p_num_samples_to_write = p_samples.size();

		sf_write_float(m_sndfile, &p_samples[0], p_num_samples_to_write);
	}


private:
	bool open_internal(std::string const &p_filename, bool const p_read, SF_INFO &p_info)
	{
		m_sndfile = sf_open(p_filename.c_str(), p_read ? SFM_READ : SFM_WRITE, &p_info);

		if (m_sndfile == nullptr)
		{
			std::cerr << "Could not open file \"" << p_filename << "\" for " << (p_read ? "reading" : "writing") << std::endl;
		}

		return (m_sndfile != nullptr);
	}

	SNDFILE *m_sndfile;
	std::size_t m_num_samples;
	unsigned int m_sample_rate;
};


//// main ////

int main(int argc, char *argv[])
{
	// Get the command line arguments.
	if (argc < 5)
	{
		std::cerr << "Usage: " << argv[0] << " <input filename> <output WAV filename> <output sample rate> <filter size>\n";
		return -1;
	}

	std::string input_filename = argv[1];
	std::string output_filename = argv[2];
	unsigned int output_sample_rate = std::stoul(argv[3]);
	std::size_t filter_size = std::stoul(argv[4]);


#ifdef WITH_RLIMIT
	// Limit the amount of memory this process can allocate to 1 GB to
	// prevent lots of swap activity in case we allocate too much. Since
	// this example just loads the entirety of the input file to memory,
	// this can happen with long tracks. With this rlimit, the process
	// is killed as soon as the limit is hit.
	struct rlimit memlim;
	getrlimit(RLIMIT_AS, &memlim);
	memlim.rlim_cur = std::min(std::size_t(1024*1024*1024), memlim.rlim_cur);
	setrlimit(RLIMIT_AS, &memlim);
#endif


	// Open the input file.
	// NOTE: In this example, we read and sample rate convert the entire
	// input file at once. This is for sake of clarity and simplicity.
	// In production, the conversion would not be done that way. Instead,
	// the input samples would be streamed in and converted on the fly.
	// This saves a ton of memory, and would also work with live streams.

	sound_file input_sound_file;
	if (!input_sound_file.open_for_reading(input_filename))
		return -1;
	samples input_samples;
	input_sound_file.read_all_samples(input_samples);
	if (input_samples.empty())
	{
		std::cerr << "Did not get any input samples\n";
		return -1;
	}

	unsigned int input_sample_rate = input_sound_file.get_sample_rate();


	// Sample rate conversion works by first interpolating the signal
	// to a higher sample rate, then decimating to a lower sample rate.
	//
	// Interpolation means that new samples are introduced in between
	// the original input samples, followed by a lowpass filter that is
	// applied on that augmented input signal. The signal is augmented
	// by inserting zeros in between the original samples. This is
	// called "zero-stuffing". Zero-stuffing introduces copies of the
	// original spectrum, _above_ the original spectrum. This is where
	// the low-pass filter comes in - it cuts off these unwanted copies,
	// leaving us with an upsampled version of the original signal.
	//
	// So, if we want to upsample the signal by an integer factor N,
	// we insert N-1 nullsamples between the original samples.
	//
	// Decimation can then be applied. This simply involves picking
	// every Mth sample, M being the decimation factor. M=1 means no
	// decimation.
	//
	// In short, first we upsample by N by applying interpolation,
	// then we downsample by M by applying decimation. The M/N ratio
	// is the overall sample rate conversion ratio. Upsampling factor
	// N is also the interpolation factor N. Downsampling factor M
	// is also the decimation factor M.
	//
	// The up/downsampling factors are derived from the input and
	// output sample rates. To that end, the least common denominator
	// of the sample rates is determined. That's because the up- and
	// downsampling factors influence filter sizes, and we want filters
	// to not be unnecessarily large. For example, input sample rate
	// 48000 Hz, output sample rate 44100 Hz, that's a ratio of
	// 48000/44100. We could use N=48000, M=44100, but for better
	// efficiency (as explained above), we reduce this and come up
	// with an equal ratio of 160/147.
	//
	// Once the factors are known, we could in theory get the input
	// signal's samples and stuff in nullsamples between these samples.
	// However, that would be wasteful (in the example above it would
	// increase the input signal size by a factor of 160). We can
	// make the observation that during convolution, these nullsamples
	// don't contribute to the output, because these nullsamples are
	// multiplied with filter coefficients and added. So, an equation
	// like:
	//
	//   output = sample * coeff1 + 0 * coeff2 + 0 * coeff3 ...
	//
	// will only really be influenced by the original samples, not
	// by the added nullsamples.
	//
	// The idea then is to simply omit these nullsamples and only
	// pick the coefficients that would actually be applied (in the
	// example above, coeff1 would be applied, while coeff2 and
	// coeff3 would not). We observe that in a zerostuffed signal,
	// the original samples appear in every Nth position. So, in this
	// zerostuffed signal:
	//
	// a 0 0 b 0 0 c ..
	//
	// every 3rd sample is an original sample. If we have a filter
	// with 12 taps, then its coefficients can be decomposed into
	// subfilters. The number of filters equal the upsampling factor.
	// In this example, the decomposition would be:
	//
	// Original filter: h1 h2 h3 h4 h5 h6 h7 h8 h9 h10 h11 h12
	// Subfilter 1:     h1       h4       h7       h10
	// Subfilter 2:        h2       h5       h8        h11
	// Subfilter 3:           h3       h6       h9         h12
	//
	// During decimation, we would normally pick a sample from the
	// interpolated signal. Suppose that we didn't use polyphase
	// filters, but instead actually did use zerostuffing and
	// filtered that zerostuffed signal. Suppose upsampling factor
	// N is 3, downsampling factor M is 2.
	//
	// Original signal: i1 i2 i3 i4 i5 i6 i7 ..
	// Zerostuffed signal: i1 0 0 i2 0 0 i3 0 0 i4 0 0 i5 0 0 ..
	// Interpolated signal: i1 a b i2 c d i3 e f i4 g h i6 i j ..
	//   (a-j are newly interpolated samples that resulted from
	//   convolving the filter with the zerostuffed signal)
	//
	// Decimation factor M = 2 means we pick every 2nd
	// interpolated sample, and get the output signal
	// i1 b c i3 f g i6 ..
	//
	// We optimize this by using polyphase filters, eliminating
	// the need for an intermediate zerostuffed signal. Instead,
	// we pick the subfilter that would produce the sample that
	// we pick during decimation.
	//
	// From the example above, suppose we are about to pick
	// sample "b". to compute "b", we pick the appropriate
	// subfilter. We find this subfilter by computing the
	// position in the interpolated signal. This we call the
	// "interpolated position". In the case of "b", this position
	// would be 2. (Positions start at 0.) We then apply modulo 3
	// (the number of subfilters) to pick the appropriate subfilter.
	// That's subfilter (2 mod 3) = 2 in this case.
	//
	// Then we need the position in the original input signal that
	// corresponds to the "b" sample". This we get by calculating
	// position_in_original_signal = position_in_output_signal * M / N
	// So, in this case, "b" is at position #1 in the output signal,
	// and 1*2/3 = 0. Now we have the position in the input signal
	// where we get the input samples that we want to convolve with
	// the subfilter #2 we picked earlier. This means that this
	// subfilter is convolved with input samples i1 to i4.
	//
	// Another example would be if we wanted to compute sample "g",
	// which is at "interpolated position" position 10 and at output
	// position 5. 10 mod 3 = 1, and 5*2/3 = 3. So, we would convolve
	// input samples i4 to i7 with subfilter #1.


	// Compute the up- and downsampling factors.
	unsigned int sample_rate_lcm = lcm(input_sample_rate, output_sample_rate);
	// This is the interpolation factor N mentioned above.
	unsigned int upsampling_factor = sample_rate_lcm / input_sample_rate;
	// This is the decimation factor M mentioned above.
	unsigned int downsampling_factor = sample_rate_lcm / output_sample_rate;


	// Print some info.
	std::cerr << "Input / output sample rate: " << input_sample_rate << " Hz / " << output_sample_rate << " Hz\n";
	std::cerr << "Up / downsampling factors: " << upsampling_factor << " / " << downsampling_factor << "\n";
	std::cerr << "Filter size: " << filter_size << "\n";


	// Prepare the polyphase filter and the output samples buffer.
	samples polyphase_filter_bank = compute_polyphase_filter_bank(upsampling_factor, downsampling_factor, filter_size);
	samples output_samples(input_samples.size() * upsampling_factor / downsampling_factor, 0);

	std::size_t num_polyphase_filters = upsampling_factor;
	std::size_t polyphase_filter_size = polyphase_filter_bank.size() / num_polyphase_filters;


	// Open the output file.
	sound_file output_sound_file;
	if (!output_sound_file.open_for_writing(output_filename, output_sample_rate))
		return -1;


	// Now perform the sample rate conversion.
	std::size_t output_position = 0;
	std::size_t interpolated_position = 0;
	std::size_t last_progress = 0;
	// Don't iterate over the last filter_size samples. That's because
	// convolve() will access all samples from output_position to
	// (output_position + polyphase_filter_size - 1).
	for (; output_position < output_samples.size() - filter_size; ++output_position, interpolated_position += downsampling_factor)
	{
		// Print some progress. We keep track of the last computed
		// progress to check if we should actually print something.
		// Otherwise, the constant console output could actually
		// slow down this code (and it floods the console with
		// lines of course).
		std::size_t progress = output_position / 100000;
		if (progress != last_progress)
		{
			std::cerr << output_position << "/" << output_samples.size() << "(" << (float(output_position) / float(output_samples.size()) * 100.0f) << "%)\n";
			last_progress = progress;
		}

		// Pick the right subfilter.
		std::size_t polyphase_filter_index = interpolated_position % num_polyphase_filters;
		sample_type const *polyphase_filter_coefficients = &(polyphase_filter_bank[polyphase_filter_index * polyphase_filter_size]);

		// Pick what input samples we need to convolve with the
		// chosen subfilter.
		sample_type const *input_samples_coefficients = &(input_samples[output_position * downsampling_factor / upsampling_factor]);

		// Perform the convolution, producing the sample rate
		// converted output.
		output_samples[output_position] = convolve(input_samples_coefficients, polyphase_filter_coefficients, polyphase_filter_size);
	}


	// Write the result.
	output_sound_file.write_samples(output_samples, output_samples.size());


	return 0;
}