# Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import collections
import contextlib
import io
import multiprocessing as mp
import os
import subprocess
import sys
import threading
import time
import tkinter as tk
import wave
from tkinter import ttk, filedialog

import speech_recognition as sr
import webrtcvad
# Lower-case file extensions (without the dot) accepted as input media.
AV_FORMAT = [
    'mp4', 'avi', 'mkv', 'wmv', 'flac', 'wav', 'mp3',
    'aac', 'ac3', 'rmvb', 'flv', 'ts', 'mts',
]
# Location of the ffmpeg executable used to extract the audio track.
FFMPEG_PATH = 'bin/ffmpeg.exe'
# Sample rate (Hz) requested from ffmpeg for the extracted WAV file.
SAMPLE_RATE = 32000
# Not referenced anywhere in the visible code.
LENGTH_CAP = 10
# Length of one VAD frame, in milliseconds.
FRAME_DURATION = 30
# Length of the VAD sliding window, in milliseconds.
PADDING_DURATION = 300
# Not referenced in the visible code (the GUI supplies its own default).
MAX_SEGMENT_DURATION = 5000  # ms
# Not referenced in the visible code (the GUI supplies its own default).
LANG = 'zh-TW'
def read_wave(path):
    """Read a mono, 16-bit PCM WAV file.

    Args:
        path: Path (or file object) accepted by ``wave.open``.

    Returns:
        A ``(pcm_data, sample_rate)`` tuple: the raw bytes of every frame in
        the file and the frame rate in Hz.

    Raises:
        ValueError: If the file is not mono, is not 16-bit, or has a sample
            rate other than 8000, 16000 or 32000 Hz.
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        # Explicit checks instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let unsupported audio through.
        num_channels = wf.getnchannels()
        if num_channels != 1:
            raise ValueError('expected mono audio, got %d channels' % num_channels)
        sample_width = wf.getsampwidth()
        if sample_width != 2:
            raise ValueError('expected 16-bit samples, got %d-byte samples' % sample_width)
        sample_rate = wf.getframerate()
        if sample_rate not in (8000, 16000, 32000):
            raise ValueError('unsupported sample rate: %d Hz' % sample_rate)
        pcm_data = wf.readframes(wf.getnframes())
    return pcm_data, sample_rate
class Frame(object):
    """One slice of audio: its raw bytes, start timestamp and duration."""

    def __init__(self, bytes, timestamp, duration):
        # ``bytes`` shadows the builtin, but it is the public parameter name,
        # so it is kept as-is for keyword callers.
        self.bytes, self.timestamp, self.duration = bytes, timestamp, duration
def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield consecutive, equally sized ``Frame`` objects cut from ``audio``.

    Args:
        frame_duration_ms: Length of each frame in milliseconds.
        audio: Raw PCM bytes; the size computation assumes 2 bytes per sample
            on a single channel.
        sample_rate: Sample rate of ``audio`` in Hz.

    Yields:
        ``Frame(bytes, timestamp, duration)`` where ``timestamp`` and
        ``duration`` are in seconds. Trailing bytes that do not fill a whole
        frame are discarded.
    """
    # Bytes per frame: samples per frame times 2 bytes per sample.
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    # ``<=`` (not ``<``): a last frame that ends exactly at the end of the
    # buffer is complete and must be emitted too.
    while offset + n <= len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n
def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames, max_frame_duration_ms):
    """Group frames into voiced segments using a sliding-window detector.

    A window of ``padding_duration_ms / frame_duration_ms`` frames drives a
    two-state machine:

    * Idle: frames are only buffered. When more than 90% of a full window is
      voiced, a segment starts with the buffered frames.
    * Triggered: every frame is collected. The segment ends when more than
      90% of a full window is unvoiced, or when it holds more than
      ``max_frame_duration_ms / frame_duration_ms`` frames.

    Each frame is classified exactly once and the verdict is stored next to
    the frame in the window, instead of re-running the detector on every
    buffered frame for every new frame.
    NOTE(review): if the detector keeps internal state between calls, calling
    it once per frame may also change its verdicts slightly - confirm.

    Args:
        sample_rate: Sample rate in Hz, forwarded to ``vad.is_speech``.
        frame_duration_ms: Length of one frame in milliseconds.
        padding_duration_ms: Length of the sliding window in milliseconds.
        vad: Object exposing ``is_speech(frame_bytes, sample_rate)``.
        frames: Iterable of objects with ``bytes``, ``timestamp`` and
            ``duration`` attributes.
        max_frame_duration_ms: Segment length cap in milliseconds.

    Yields:
        ``(pcm_bytes, begin, end)`` for each segment, where ``begin`` is the
        timestamp of the first frame and ``end`` is the end of the last one.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # Holds (frame, is_speech) pairs for the most recent frames.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    max_seg_frame = int(max_frame_duration_ms / frame_duration_ms)
    threshold = 0.9 * ring_buffer.maxlen
    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        ring_buffer.append((frame, is_speech))
        if not triggered:
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            if num_voiced > threshold:
                begin = ring_buffer[0][0].timestamp
                triggered = True
                # Keep the buffered lead-in so the segment start is not cut.
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            if num_unvoiced > threshold or len(voiced_frames) > max_seg_frame:
                end = frame.timestamp + frame.duration
                triggered = False
                yield b''.join(f.bytes for f in voiced_frames), begin, end
                ring_buffer.clear()
                voiced_frames = []
    # Flush a segment that was still open when the input ran out.
    if voiced_frames:
        end = frame.timestamp + frame.duration
        yield b''.join(f.bytes for f in voiced_frames), begin, end
def getFilenameExt(filename):
    """Return the lower-cased text after the last '.' in ``filename``.

    When ``filename`` contains no dot, the whole name is returned lower-cased.
    """
    _, _, extension = str(filename).rpartition('.')
    return extension.lower()
def changeFilenameExt(path, newExt):
    """Return ``path`` with its file extension replaced by ``newExt``.

    Only an extension on the final path component is replaced; a dot inside a
    directory name is left alone. A path without an extension simply gets
    ``'.' + newExt`` appended.

    Args:
        path: Source path; converted with ``str()``.
        newExt: New extension without the leading dot; converted with ``str()``.

    Returns:
        The new path as a string.
    """
    # os.path.splitext only splits on a dot in the last component, unlike a
    # scan for the last '.' anywhere in the string.
    stem, _ = os.path.splitext(str(path))
    return stem + '.' + str(newExt)
class ffmpegProcess(threading.Thread):
    """Thread that runs ffmpeg to extract a mono 16-bit PCM WAV from a file.

    The output is written next to the input, with the extension changed to
    ``wav``. After ``run()`` finishes, ``stdout`` and ``stderr`` hold the
    result of ``communicate()`` (both ``None``, because no pipes are
    requested) and ``returncode`` holds ffmpeg's exit status.
    """

    def __init__(self, file_path):
        threading.Thread.__init__(self)
        self.file_path = file_path

    def run(self):
        # Pass the command as an argument list rather than one formatted
        # string: file names containing quotes or spaces then need no manual
        # quoting, and the call also works where a command string would be
        # taken as the program name.
        command = [
            FFMPEG_PATH,
            '-y',
            '-i', str(self.file_path),
            '-c', 'copy',
            '-vn',
            '-acodec', 'pcm_s16le',
            '-ar', str(SAMPLE_RATE),
            '-ac', '1',
            changeFilenameExt(self.file_path, 'wav'),
        ]
        sp = subprocess.Popen(command)
        self.stdout, self.stderr = sp.communicate()
        self.returncode = sp.returncode
def extractAudio(file_path):
    """Extract the audio of ``file_path`` to a WAV file via ffmpeg.

    The extension must be listed in ``AV_FORMAT``; otherwise a message is
    printed and ``sys.exit(1)`` is called. The ffmpeg worker thread is started
    and joined, so this call blocks until ffmpeg has finished.
    """
    if getFilenameExt(file_path) not in AV_FORMAT:
        print('Not support file')
        sys.exit(1)

    worker = ffmpegProcess(file_path)
    worker.start()
    worker.join()
def recognize_mp(segments, samprate, lang, window, num_process):
    """Recognize all segments in a process pool while updating the progress bar.

    Args:
        segments: Iterable of ``(pcm_bytes, begin, end)`` tuples.
        samprate: Sample rate forwarded to ``APIProcess.recognize``.
        lang: Language code forwarded to ``APIProcess.recognize``.
        window: Object exposing ``progressbar`` and ``update()``.
        num_process: Number of worker processes (converted with ``int``).

    Returns:
        A plain list with whatever the workers appended to the shared
        subtitle list; the order is not guaranteed.
    """
    manager = mp.Manager()
    subtitles = manager.list()
    mpPool = mp.Pool(processes=int(num_process))
    # Shared counter incremented by each worker when it is done, guarded by
    # ``lock``.
    progress = manager.Value('i', 0)
    lock = manager.Lock()

    bar = window.progressbar
    bar['value'] = 0
    window.update()

    # Submit one task per segment and count them on the way.
    total = 0
    for total, (segment, begin, end) in enumerate(segments, start=1):
        mpPool.apply_async(
            APIProcess.recognize,
            args=(segment, begin, end, samprate, lang, subtitles, progress, lock),
        )
    mpPool.close()

    bar.config(maximum=total)
    # Poll the shared counter until every task has reported completion.
    while progress.value < total:
        bar['value'] = progress.value
        window.update()
        time.sleep(0.1)
    mpPool.join()

    bar['value'] = total
    window.update()
    return list(subtitles)
class APIProcess(threading.Thread):
    """Worker thread that turns user-selected media files into .srt subtitles.

    The thread starts itself at the end of the constructor. ``run()`` asks the
    user for files and, for each one, extracts the audio, splits it into
    voiced segments, sends the segments to the recognizer and writes a
    subtitle file next to the input.

    NOTE(review): ``run()`` touches tkinter widgets and opens a file dialog
    from this worker thread; whether that is safe depends on the Tk build.
    """

    def __init__(self, lang, samprate, maxlength, sensetive, progress, window, num_process):
        threading.Thread.__init__(self)
        self.lang = lang
        self.samprate = int(samprate)
        self.maxlength = int(maxlength)
        self.sensetive = int(sensetive)
        self.progress = progress
        self.window = window
        self.num_process = num_process
        # Start only after every attribute is assigned: run() reads all of
        # them, including num_process.
        self.start()

    @staticmethod
    def recognize(segment, begin, end, samprate, lang, subtitles, progess, lock):
        """Recognize one segment and append ``(begin, end, text)`` to ``subtitles``.

        ``segment`` must be 16-bit PCM at ``samprate`` Hz. If recognition
        raises, the exception propagates and the segment is simply missing
        from ``subtitles``; the shared counter is incremented either way so
        the caller's progress loop can finish.
        """
        r = sr.Recognizer()
        # Sample width is 2 bytes: the audio is 16-bit PCM.
        audio = sr.AudioData(segment, samprate, 2)
        try:
            subtitle = r.recognize_google(audio, language=lang)
            print(subtitle)
            subtitles.append((begin, end, subtitle))
        finally:
            with lock:
                progess.value += 1

    @staticmethod
    def _resample_segments(segments, source_rate, target_rate):
        """Yield the segments with their 16-bit PCM converted to ``target_rate``."""
        for pcm, begin, end in segments:
            audio = sr.AudioData(pcm, source_rate, 2)
            yield audio.get_raw_data(convert_rate=target_rate), begin, end

    @staticmethod
    def _format_srt_time(seconds):
        """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
        # Work in whole milliseconds so that minutes and seconds wrap at 60
        # and float noise cannot drop a millisecond.
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        secs, millis = divmod(remainder, 1000)
        return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)

    @staticmethod
    def _write_srt(path, subtitles):
        """Write ``(begin, end, text)`` tuples to ``path`` as a UTF-8 SRT file."""
        stamp = APIProcess._format_srt_time
        # newline='' keeps the explicit CRLF line endings untouched.
        with io.open(path, 'w', encoding='utf8', newline='') as srt:
            # Cues are sorted by start time and numbered from 1.
            for index, (begin, end, text) in enumerate(sorted(subtitles), start=1):
                srt.write('%d\r\n' % index)
                srt.write('%s --> %s\r\n' % (stamp(begin), stamp(end)))
                srt.write(text + '\r\n\r\n')

    def run(self):
        """Process every file the user selects; always re-enable the start button."""
        window = self.window
        progress = self.progress
        window.startButton.config(state='disable')
        window.update()
        try:
            file_list = filedialog.askopenfilenames()
            total_files = len(file_list)
            for complete, file_path in enumerate(file_list):
                window.root.title('ScribeArbeit - %s' % (file_path))
                progress.set('Extracting audio...(%d/%d)' % (complete, total_files))
                window.update()
                extractAudio(file_path)
                progress.set('Generating subtitles...(%d/%d)' % (complete, total_files))
                window.update()

                tmp_path = changeFilenameExt(file_path, 'wav')
                audio, sample_rate = read_wave(tmp_path)
                vad = webrtcvad.Vad(self.sensetive)
                frames = frame_generator(FRAME_DURATION, audio, sample_rate)
                segments = vad_collector(sample_rate, FRAME_DURATION, PADDING_DURATION,
                                         vad, frames, self.maxlength)
                # The recognizer is told the audio is at self.samprate, so
                # convert the segments when the WAV uses a different rate.
                if self.samprate != sample_rate:
                    segments = self._resample_segments(segments, sample_rate, self.samprate)
                subtitles = recognize_mp(segments, self.samprate, self.lang,
                                         window, self.num_process)
                self._write_srt(changeFilenameExt(file_path, 'srt'), subtitles)

                # When the input itself is a .wav file, the "temporary" path
                # is the user's own file and must not be deleted.
                if os.path.normcase(tmp_path) != os.path.normcase(str(file_path)):
                    os.remove(tmp_path)
                progress.set('Complete!(%d/%d)' % (complete + 1, total_files))
            window.root.title('ScribeArbeit - %s' % ('All complete!'))
        finally:
            # Re-enable the button even if a file failed, so the user can retry.
            window.startButton.config(state='normal')
            window.update()
class MainWindow(tk.Frame):
    """Settings form, progress display and start button of the application.

    All widgets are children of ``root`` and laid out with ``grid``: one
    label/input pair per row, then the status label and start button, then
    the progress bar. ``daemon`` is accepted but not used.
    """

    def __init__(self, root=None, daemon=None):
        tk.Frame.__init__(self, root)
        self.root = root

        # Row 0: recognition language.
        self.labelLang = ttk.Label(root, text='Language : ')
        self.labelLang.grid(column=0, row=0)
        self.comboLang = ttk.Combobox(root, values=('zh-TW', 'en-US', 'ja-JP'))
        self.comboLang.current(0)
        self.comboLang.grid(column=1, row=0)

        # Row 1: sampling rate, default 16000.
        self.labelSample = ttk.Label(root, text='Sampling Rate : ')
        self.labelSample.grid(column=0, row=1)
        self.comboSample = ttk.Combobox(root, values=(8000, 16000, 32000))
        self.comboSample.current(1)
        self.comboSample.grid(column=1, row=1)

        # Row 2: maximum segment length in milliseconds.
        self.labelMaxLen = ttk.Label(root, text='Max Segment Length(ms) : ')
        self.labelMaxLen.grid(column=0, row=2)
        self.entryMaxLen = ttk.Entry(root)
        self.entryMaxLen.insert(0, '5000')
        self.entryMaxLen.grid(column=1, row=2)

        # Row 3: value handed to the VAD constructor, default 1.
        self.labelSense = ttk.Label(root, text='Sensitive')
        self.labelSense.grid(column=0, row=3)
        self.comboSense = ttk.Combobox(root, values=(0, 1, 2, 3))
        self.comboSense.grid(column=1, row=3)
        self.comboSense.current(1)

        # Row 4: number of worker processes, choices 1..32, default 8.
        self.labelProcess = ttk.Label(root, text='Processes')
        self.labelProcess.grid(column=0, row=4)
        self.comboProcess = ttk.Combobox(root, values=tuple(range(1, 33)))
        self.comboProcess.grid(column=1, row=4)
        self.comboProcess.current(7)

        # Row 5 (left): status text, written through ``stringVar``.
        self.stringVar = tk.StringVar()
        self.labelProgress = ttk.Label(root, textvariable=self.stringVar)
        self.labelProgress.grid(column=0, row=5)

        # Row 6: progress bar spanning both columns.
        self.progressbar = ttk.Progressbar(root, length=400, maximum=100, mode='determinate')
        self.progressbar.grid(column=0, row=6, columnspan=2)

        # Row 5 (right): start button.
        self.startButton = ttk.Button(root, text='start', command=self._start_worker)
        self.startButton.grid(column=1, row=5)

    def _start_worker(self):
        """Create an APIProcess from the current form values."""
        return APIProcess(
            self.comboLang.get(),
            self.comboSample.get(),
            self.entryMaxLen.get(),
            self.comboSense.get(),
            self.stringVar,
            self,
            self.comboProcess.get(),
        )
class Daemon:
    """Application object: creates the Tk root and runs the main window.

    The constructor blocks in ``mainloop()``.
    """

    def __init__(self):
        root = tk.Tk()
        root.title('ScribeArbeit')
        self.root = root
        self.mainWindow = MainWindow(root, self)
        self.mainWindow.mainloop()
if __name__ == '__main__':
    # Must be the first call under the guard so that a frozen executable can
    # spawn multiprocessing workers.
    mp.freeze_support()
    # Constructing Daemon builds the GUI and blocks in its main loop.
    daemon = Daemon()
# Add Comment
# Please, Sign In to add comment