Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import io
- import typing as T
- import numpy as np
- from PIL import Image
- import pydub
- from scipy.io import wavfile
- import torch
- import torchaudio
def spectrogram_image_from_wav(wav_bytes: io.BytesIO, max_volume: float = 50, power_for_image: float = 0.25) -> Image.Image:
    """
    Generate a spectrogram image from a WAV file.

    Args:
        wav_bytes: Binary stream containing a complete WAV file.
        max_volume: Passed through to image_from_spectrogram.
        power_for_image: Passed through to image_from_spectrogram.

    Returns:
        The image built by image_from_spectrogram from the mel-scaled
        magnitude spectrogram of the audio.
    """
    # Keep the sample rate stored in the file: the window sizes below are
    # given in milliseconds and are only correct when converted to samples
    # with the rate the audio was actually recorded at.
    sample_rate, waveform = wavfile.read(wav_bytes)

    # scipy returns multi-channel audio as (num_samples, num_channels).
    # Flattening that directly would interleave the channels into a single
    # meaningless signal, so average them down to mono first.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    n_mels = 512
    mel_scale = True

    # FFT parameters
    window_duration_ms = 100  # [ms]
    padded_duration_ms = 400  # [ms]
    step_size_ms = 10  # [ms]

    # Derived parameters, converted from milliseconds to samples
    n_fft = int(padded_duration_ms / 1000.0 * sample_rate)
    hop_length = int(step_size_ms / 1000.0 * sample_rate)
    win_length = int(window_duration_ms / 1000.0 * sample_rate)

    # Compute spectrogram from waveform
    Sxx = spectrogram_from_waveform(
        waveform=waveform,
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        mel_scale=mel_scale,
        n_mels=n_mels,
    )

    # Convert spectrogram to image
    return image_from_spectrogram(Sxx, max_volume=max_volume, power_for_image=power_for_image)
def spectrogram_from_waveform(
    waveform: np.ndarray,
    sample_rate: int,
    n_fft: int,
    hop_length: int,
    win_length: int,
    mel_scale: bool = True,
    n_mels: int = 512,
) -> np.ndarray:
    """
    Return the magnitude spectrogram of a waveform, optionally mel-scaled.

    The waveform is cast to float32 and flattened into a single row before
    the short-time Fourier transform is applied; only the magnitude of the
    complex result is kept.
    """
    # Flatten the samples into a (1, num_samples) float32 tensor.
    samples = torch.from_numpy(waveform.astype(np.float32)).reshape(1, -1)

    # power=None requests the complex STFT; its magnitude is taken below.
    stft = torchaudio.transforms.Spectrogram(
        n_fft=n_fft,
        power=None,
        hop_length=hop_length,
        win_length=win_length,
    )
    magnitudes = np.abs(stft(samples).numpy()[0])

    if not mel_scale:
        return magnitudes

    # Project the linear-frequency bins onto n_mels mel bands (0 Hz - 10 kHz).
    to_mel = torchaudio.transforms.MelScale(
        n_mels=n_mels,
        sample_rate=sample_rate,
        f_min=0,
        f_max=10000,
        n_stft=n_fft // 2 + 1,
        norm=None,
        mel_scale="htk",
    )
    return to_mel(torch.from_numpy(magnitudes)).numpy()
def image_from_spectrogram(
    Sxx: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Generate an image from a spectrogram magnitude array.

    The magnitudes are compressed with a power curve, scaled so that
    ``max_volume`` maps to 255, inverted, flipped vertically and stored as an
    8-bit greyscale image. Values that fall outside the 0-255 range after
    scaling are saturated rather than wrapped around.

    Args:
        Sxx: Spectrogram magnitudes indexed as [row, column]; expected to be
            non-negative.
        max_volume: Value of the power-compressed magnitude that maps to
            full scale.
        power_for_image: Exponent of the power curve applied to ``Sxx``.

    Returns:
        A PIL image in mode "L".

    TODO(hayk): Add spectrogram_from_image and call this out as the reverse.
    """
    # Apply power curve
    data = np.power(Sxx, power_for_image)

    # Rescale to 255
    data = data * 255 / max_volume

    # Invert
    data = 255 - data

    # Flip Y
    data = data[::-1, :]

    # Saturate before the integer cast: magnitudes above max_volume would
    # otherwise go negative here and wrap around to bright pixels.
    data = np.clip(data, 0, 255)

    # Convert to 8-bit unsigned integer
    data = data.astype(np.uint8)

    # Create image
    image = Image.fromarray(data, mode="L")

    return image
# Load the whole recording into an in-memory binary stream.
with open('music.wav', 'rb') as f:
    wav_bytes = io.BytesIO(f.read())

# Render the spectrogram and write it to restoredinput.png.
image = spectrogram_image_from_wav(wav_bytes)
image.save('restoredinput.png')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement