import argparse
import asyncio
import io

import edge_tts
import librosa
import soundfile

from inference.infer_tool import Svc


def text2tts(text, voice, output_file):
    # Synthesize speech with edge-tts, then reload it as 16 kHz mono,
    # the input format expected by the Svc model.
    asyncio.run(edge_tts.Communicate(text, voice).save(output_file))
    audio, sr = librosa.load(output_file, sr=16000, mono=True)
    raw_path = io.BytesIO()
    soundfile.write(raw_path, audio, 16000, format="wav")
    raw_path.seek(0)
    return raw_path


def tts2herta(raw_path, model):
    # Run voice conversion; auto_predict_f0=True lets the model estimate pitch
    # instead of transposing by a fixed number of semitones (here 0).
    out_audio, out_sr = model.infer("speaker0", 0, raw_path, auto_predict_f0=True)
    return out_audio.cpu().numpy()


def main(text, output_file):
    voice = "ru-RU-SvetlanaNeural"
    model = Svc("Herta-Svc/G_10000.pth", "Herta-Svc/config.json", device="cpu")
    # output_file holds the intermediate edge-tts audio; the converted result
    # is always written to out_audio.wav at 44.1 kHz.
    raw_path = text2tts(text, voice, output_file)
    out = tts2herta(raw_path, model)
    soundfile.write("out_audio.wav", out, 44100)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text-to-Speech using Herta-Svc")
    parser.add_argument("-text", type=str, required=True, help="The text to convert to speech")
    parser.add_argument("-file_name", type=str, default="output.wav", help="The output file name")
    args = parser.parse_args()
    main(args.text, args.file_name)
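
# Example invocation (a sketch: the script name herta_tts.py is assumed, and the
# Herta-Svc checkpoint/config must exist at the paths hard-coded in main()):
#
#   python herta_tts.py -text "Привет, мир!" -file_name tts_raw.wav
#
# This writes the raw edge-tts audio to tts_raw.wav and the converted
# Herta voice to out_audio.wav.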