from flask import Flask, render_template, request, jsonify
import requests
import os
import soundfile as sf
import numpy as np
from kokoro import KPipeline  # Your existing Kokoro TTS
import uuid
import base64
import io  # For handling in-memory audio

# Attempt to import NeMo ASR
try:
    import nemo.collections.asr as nemo_asr
    print("NVIDIA NeMo ASR Toolkit imported successfully.")
except ImportError:
    print('NVIDIA NeMo ASR Toolkit not found. Please install it: pip install "nemo_toolkit[asr]"')
    nemo_asr = None
except Exception as e:
    print(f"Error importing NeMo ASR: {e}")
    nemo_asr = None
app = Flask(__name__)

# --- Configuration ---
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_TAGS_URL = "http://localhost:11434/api/tags"
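# For reference, the request/response shapes this app relies on (these match
# the calls made further down; field names are from Ollama's REST API):
#   POST /api/generate with {"model": ..., "prompt": ..., "system": ..., "stream": False}
#     -> JSON object whose "response" field holds the generated text.
#   GET /api/tags
#     -> {"models": [{"name": "llama2:latest", ...}, ...]}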
MODEL_DIR = "models"  # General models directory
ASR_MODEL_DIR = os.path.join(MODEL_DIR, "asr_models")  # Specific to ASR
KOKORO_MODEL_PATH = os.path.join(MODEL_DIR, "kokoro-v1.0.onnx")
KOKORO_VOICES_PATH = os.path.join(MODEL_DIR, "voices-v1.0.bin")
KOKORO_LANG_CODE = "a"  # Kokoro's code for American English
VOICE = "af_bella"  # American-English female voice

# Parakeet ASR model configuration
ASR_MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
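# Note: ASRModel.from_pretrained() downloads and caches this checkpoint the
# first time it runs (a ~0.6B-parameter model), so the initial startup needs
# network access and can take a while.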
# --- Initialize Services ---
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(ASR_MODEL_DIR, exist_ok=True)  # Ensure ASR model directory exists
os.makedirs("static", exist_ok=True)  # For temporary audio files if needed
# Initialize Kokoro TTS
tts_pipeline = None
try:
    # KPipeline may log a repo_id warning that comes from the Hugging Face
    # libraries Kokoro uses internally (the weights live at hexgrad/Kokoro-82M).
    # Kokoro initializes fine despite the warning, so it is left as-is here.
    if os.path.exists(KOKORO_MODEL_PATH) and os.path.exists(KOKORO_VOICES_PATH):
        tts_pipeline = KPipeline(lang_code=KOKORO_LANG_CODE)
        print("Kokoro TTS initialized successfully.")
    else:
        print("Kokoro TTS model/voice files not found. Skipping initialization.")
except Exception as e:
    print(f"Error initializing Kokoro TTS: {str(e)}")
# Initialize NeMo ASR model
asr_model_instance = None
if nemo_asr:
    try:
        print(f"Loading ASR model: {ASR_MODEL_NAME}...")
        asr_model_instance = nemo_asr.models.ASRModel.from_pretrained(
            model_name=ASR_MODEL_NAME,
            map_location='cpu'  # Use 'cuda' if you have a GPU and CUDA installed
        )
        asr_model_instance.eval()  # Set to evaluation mode
        print(f"ASR model '{ASR_MODEL_NAME}' loaded successfully.")
    except Exception as e:
        print(f"Error loading ASR model '{ASR_MODEL_NAME}': {str(e)}")
        print("ASR will not be available.")
else:
    print("NeMo ASR toolkit not available. ASR functionality will be disabled.")
def download_kokoro_model():
    global tts_pipeline  # Re-initialize the module-level pipeline once files exist
    if not os.path.exists(KOKORO_MODEL_PATH) or not os.path.exists(KOKORO_VOICES_PATH):
        print("Attempting to download Kokoro TTS model files...")
        try:
            model_url = "https://github.com/nazdridoy/kokoro-tts/releases/download/v1.0.0/kokoro-v1.0.onnx"
            voices_url = "https://github.com/nazdridoy/kokoro-tts/releases/download/v1.0.0/voices-v1.0.bin"
            for url, path in [(model_url, KOKORO_MODEL_PATH), (voices_url, KOKORO_VOICES_PATH)]:
                if not os.path.exists(path):
                    print(f"Downloading {url} to {path}")
                    response = requests.get(url, stream=True)
                    response.raise_for_status()
                    with open(path, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
            if not tts_pipeline and os.path.exists(KOKORO_MODEL_PATH) and os.path.exists(KOKORO_VOICES_PATH):
                tts_pipeline = KPipeline(lang_code=KOKORO_LANG_CODE)
                print("Kokoro TTS initialized successfully after download.")
        except Exception as e:
            print(f"Error downloading Kokoro model: {str(e)}")

download_kokoro_model()
def get_ollama_models():
    try:
        response = requests.get(OLLAMA_TAGS_URL)
        response.raise_for_status()
        models_data = response.json().get("models", [])
        return [model["name"] for model in models_data]
    except requests.RequestException as e:
        print(f"Error fetching Ollama models: {str(e)}")
        return ["llama2:latest"]  # Fallback so the UI still renders
@app.route('/')
def index():
    models = get_ollama_models()
    return render_template('index2.html', models=models)
@app.route('/process_voice_input', methods=['POST'])
def process_voice_input():
    if 'audio_data' not in request.files:
        return jsonify({"error": "No audio data found in request"}), 400
    if not asr_model_instance:
        return jsonify({"error": "ASR model not available on server"}), 500

    audio_file = request.files['audio_data']
    temp_audio_filename = f"temp_audio_{uuid.uuid4()}.wav"
    temp_audio_path = os.path.join("static", temp_audio_filename)
    try:
        audio_file.save(temp_audio_path)
        print(f"Temporary audio file saved to: {temp_audio_path}")

        # Transcribe the audio with the NeMo ASR model
        transcription_results = asr_model_instance.transcribe([temp_audio_path])

        user_input_text = ""  # Initialize to empty
        if transcription_results:
            # Depending on the model and settings, NeMo's transcribe() returns:
            #   1. a list of plain strings (the simple case),
            #   2. a list of Hypothesis objects, one per input file, or
            #   3. a list of lists of Hypothesis objects (N-best, even with N=1).
            # The branches below handle all three shapes.
            first_file_result = transcription_results[0]  # Result for the first (and only) file
            if isinstance(first_file_result, str):
                user_input_text = first_file_result
            elif isinstance(first_file_result, list) and len(first_file_result) > 0:
                # N-best list: take the text of the top hypothesis
                if hasattr(first_file_result[0], 'text'):
                    user_input_text = first_file_result[0].text
                else:
                    print(f"Warning: Top hypothesis in N-best list lacks a 'text' attribute: {first_file_result[0]}")
            elif hasattr(first_file_result, 'text'):
                # A single Hypothesis object for this input file
                user_input_text = first_file_result.text
            else:
                print(f"Warning: Unexpected transcription result format: {first_file_result}")
        else:
            print("Warning: ASR returned no transcription results at all.")
        print(f"Transcribed text: '{user_input_text}'")
    except Exception as e:
        print(f"Error during ASR transcription: {str(e)}")
        return jsonify({"error": f"ASR transcription error: {str(e)}"}), 500
    finally:
        # Always clean up the temporary file, even if transcription failed
        if os.path.exists(temp_audio_path):
            try:
                os.remove(temp_audio_path)
                print(f"Temporary audio file {temp_audio_path} removed.")
            except Exception as e_remove:
                print(f"Error removing temporary audio file {temp_audio_path}: {e_remove}")

    if not user_input_text.strip():
        return jsonify({
            "transcribed_text": user_input_text,
            "text": "Could not understand audio or audio was silent.",
            "audio": None
        })
    selected_model = request.form.get('model', get_ollama_models()[0])
    system_prompt = request.form.get('system_prompt', "You are a helpful, friendly AI assistant.")
    available_models = get_ollama_models()
    if selected_model not in available_models:
        selected_model = available_models[0] if available_models else "llama2:latest"

    try:
        ollama_payload = {
            "model": selected_model,
            "prompt": user_input_text,
            "system": system_prompt,
            "stream": False
        }
        print(f"Sending to Ollama: {ollama_payload}")
        ollama_response = requests.post(OLLAMA_API_URL, json=ollama_payload)
        ollama_response.raise_for_status()
        llm_response_text = ollama_response.json().get("response", "")
        print(f"Ollama response: '{llm_response_text}'")
    except requests.RequestException as e:
        print(f"Ollama API error: {str(e)}")
        return jsonify({"error": f"Ollama API error: {str(e)}", "transcribed_text": user_input_text}), 500
    tts_audio_base64 = None
    if tts_pipeline and llm_response_text:
        try:
            # KPipeline yields (graphemes, phonemes, audio) tuples; only the
            # audio arrays are needed here. Kokoro outputs 24 kHz mono audio.
            generator = tts_pipeline(llm_response_text, voice=VOICE)
            audio_chunks = [audio for _, _, audio in generator]
            if audio_chunks:
                final_audio_np = np.concatenate(audio_chunks)
                wav_buffer = io.BytesIO()
                sf.write(wav_buffer, final_audio_np, 24000, format='WAV', subtype='PCM_16')
                wav_buffer.seek(0)
                tts_audio_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
                print("Kokoro TTS audio generated.")
            else:
                print("Kokoro TTS produced no audio chunks.")
        except Exception as e:
            print(f"Kokoro TTS generation error: {str(e)}")
            llm_response_text += " (TTS Error)"
    elif not tts_pipeline:
        print("Kokoro TTS pipeline not available.")
    elif not llm_response_text:
        print("LLM response was empty, skipping TTS.")

    return jsonify({
        "transcribed_text": user_input_text,
        "text": llm_response_text,
        "audio": tts_audio_base64
    })
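# Example client call for the voice route (illustrative; the WAV filename is a
# placeholder, and the form field names match the route above):
#   curl -X POST http://localhost:5000/process_voice_input \
#        -F "audio_data=@recording.wav" \
#        -F "model=llama2:latest" \
#        -F "system_prompt=You are a helpful, friendly AI assistant."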
@app.route('/process_typed_text', methods=['POST'])
def process_typed_text():
    data = request.get_json(silent=True) or {}  # Tolerate missing/invalid JSON bodies
    user_input_text = data.get('text')
    selected_model = data.get('model')
    system_prompt = data.get('system_prompt', "You are a helpful, friendly AI assistant.")
    if not user_input_text:
        return jsonify({"error": "No input text provided"}), 400

    available_models = get_ollama_models()
    if selected_model not in available_models:
        selected_model = available_models[0] if available_models else "llama2:latest"

    try:
        ollama_payload = {
            "model": selected_model,
            "prompt": user_input_text,
            "system": system_prompt,
            "stream": False
        }
        print(f"Sending typed text to Ollama: {ollama_payload}")
        ollama_response = requests.post(OLLAMA_API_URL, json=ollama_payload)
        ollama_response.raise_for_status()
        llm_response_text = ollama_response.json().get("response", "")
        print(f"Ollama response to typed text: '{llm_response_text}'")
    except requests.RequestException as e:
        print(f"Ollama API error (typed text): {str(e)}")
        return jsonify({"error": f"Ollama API error: {str(e)}"}), 500
    tts_audio_base64 = None
    if tts_pipeline and llm_response_text:
        try:
            # Same TTS path as the voice route: collect audio chunks and encode as WAV
            generator = tts_pipeline(llm_response_text, voice=VOICE)
            audio_chunks = [audio for _, _, audio in generator]
            if audio_chunks:
                final_audio_np = np.concatenate(audio_chunks)
                wav_buffer = io.BytesIO()
                sf.write(wav_buffer, final_audio_np, 24000, format='WAV', subtype='PCM_16')
                wav_buffer.seek(0)
                tts_audio_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
                print("Kokoro TTS for typed text generated.")
        except Exception as e:
            print(f"Kokoro TTS generation error (typed text): {str(e)}")
            llm_response_text += " (TTS Error)"

    return jsonify({
        "transcribed_text": None,
        "text": llm_response_text,
        "audio": tts_audio_base64
    })
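# Example client call for the typed-text route (illustrative):
#   curl -X POST http://localhost:5000/process_typed_text \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Hello there", "model": "llama2:latest"}'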
if __name__ == '__main__':
    # Note: with debug=True, Werkzeug's reloader imports this module twice, so
    # the ASR and TTS models load twice; pass use_reloader=False to avoid that.
    app.run(host='0.0.0.0', port=5000, debug=True)