# Untitled: script that times OuteTTS (GGUF) speech synthesis.

import torch
import time
import outetts

# Prefer the first CUDA GPU, but fall back to the CPU so the script still
# starts on machines where no CUDA device is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Configure the GGUF model
model_config = outetts.GGUFModelConfig_v1(
    model_path="OuteTTS-0.2-500M-Q6_K.gguf",
    language="en",  # Supported languages in v0.2: en, zh, ja, ko
    n_gpu_layers=24,
    device=device,
)

# Build the GGUF interface from the configuration above, then load the
# default speaker profile that synthesize() passes to every generate() call.
interface = outetts.InterfaceGGUF(model_version="0.2", cfg=model_config)
speaker = interface.load_default_speaker(name="male_1")

def calculate_audio_duration(output):
    """Return the duration, in seconds, of a synthesized audio clip.

    Args:
        output: Object exposing ``audio`` (an array-like with a ``shape``;
            its last axis is assumed to be the sample axis) and ``sr``
            (the sample rate, in samples per second).

    Returns:
        The sample count divided by the sample rate.

    Side effects:
        Prints the sample count and the sample rate to stdout.
    """
    sample_rate = output.sr
    # Read the sample count from the last axis instead of hard-coding axis 1,
    # so 1-D (samples,) audio works as well as 2-D (channels, samples) audio.
    num_samples = output.audio.shape[-1]

    print(f"Samples: {num_samples}")
    print(f"Sample Rate: {sample_rate}")

    return num_samples / sample_rate

def synthesize(text):
    """Generate speech for ``text`` and report how long generation took.

    Calls ``interface.generate`` with the module-level ``speaker`` and fixed
    sampling settings, measures the elapsed time of that call, and prints
    the resulting audio duration next to the inference time.

    Args:
        text: The text to synthesize.

    Returns:
        The object returned by ``interface.generate``.
    """
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be skewed by system clock adjustments as time.time() can.
    start_time = time.perf_counter()
    output = interface.generate(
        text=text,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4096,
        speaker=speaker,
    )
    inference_time = time.perf_counter() - start_time

    # Duration is computed after the timer stops so its prints are not timed.
    audio_duration = calculate_audio_duration(output)
    print(f"Inference time for '{text}': {audio_duration:.4f} seconds audio in {inference_time:.4f} seconds")
    return output

# Sample inputs: a multi-sentence passage and a very short phrase.
text_to_synthesize_1 = (
    "Speech synthesis is the artificial production of human speech. "
    "A computer system used for this purpose is called a speech synthesizer, "
    "and it can be implemented in software or hardware products."
)
text_to_synthesize_2 = "Hello my boy"

# Synthesize the first passage (synthesize() prints the timing) and write
# the result to a WAV file.
first_clip = synthesize(text_to_synthesize_1)
first_clip.save("output1.wav")