Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # ver.4 (Apr 18, 2025)
- import msgspec
- import pysubs2
# Folder that holds the transcript (scribe.json) and receives the subtitle files.
path = r"MIRU\ep3"

# Longest time, in seconds, that the first/last word of a line may occupy
# before it is clamped; longer words elsewhere in a line are only reported.
max_word_duration = 1  # in seconds

# Punctuation classes used when cutting the transcript into subtitle lines:
#   punc1 - always ends a line and is removed from the end of the word text
#   punc2 - ends a line only once the line holds 15+ characters; shown as a space
#   punc3 - always ends a line and is kept in the text
# NOTE(review): punc1 and punc2 contain duplicated entries; presumably the
# duplicates were full-width variants that were lost when the script was
# pasted - confirm against the original file.
punc1 = ["。", "!", "?", ".", "!", "?"]
punc2 = ["、", ",", "、", ","]
punc3 = ["…"]
punc = punc1 + punc2

# Read the transcript inside a context manager so the file handle is closed
# as soon as the bytes are in memory (the previous bare open() leaked it).
with open(rf"{path}\scribe.json", "rb") as scribe_file:
    scribe = msgspec.json.decode(scribe_file.read())

# Alternative for transcripts exported from the web UI, which nest words in segments:
# words = sum([segment["words"] for segment in scribe["segments"]], [])  # web ui
words = scribe["words"]  # api
def events_to_string(events):
    """Serialise ``events`` to ASS text and return only the event lines.

    The events are placed in a throw-away ``pysubs2.SSAFile``, rendered in the
    "ass" format, and everything up to and including the ``[Events]`` header
    and its ``Format:`` line is discarded. The remainder is returned with
    surrounding whitespace stripped.

    Raises:
        IndexError: if the rendered text does not contain the expected
            ``[Events]`` header followed by the ``Format:`` line.
    """
    events_header = (
        "\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    )
    scratch_file = pysubs2.SSAFile()
    scratch_file.events = events
    rendered = scratch_file.to_string(format_="ass")
    after_header = rendered.split(events_header)[1]
    return after_header.strip()
# Output subtitle file; every event created later uses the "ja" style below.
subs = pysubs2.SSAFile()

# Script resolution written to the file's info section.
subs.info.update({"PlayResX": "1920", "PlayResY": "1080"})

# Appearance of the "ja" style. Colours are passed as "&H..." strings.
style = pysubs2.SSAStyle(
    **{
        "fontname": "IPAexGothic",
        "fontsize": 40,
        "primarycolor": "&H00FFFFFF",
        "secondarycolor": "&H00FFD500",
        "outlinecolor": "&H00000000",
        "backcolor": "&H004A4A4A",
        "bold": True,
        "alignment": 2,
        "shadow": 0,
    }
)
subs.styles["ja"] = style
# Words collected for the subtitle line currently being assembled.
current_line = []

# Keep only real words: entries typed "audio_event" are discarded.
words = list(filter(lambda entry: entry["type"] != "audio_event", words))

# Normalise a three-dot ellipsis to the single "…" character (in place).
for word in words:
    normalised_text = word["text"].replace("...", "…")
    word["text"] = normalised_text
# Runs of two or more consecutive "・" words are replaced by a single "…" word,
# and runs of two or more consecutive "-" words by a single "——" word. The
# merged word starts where the first word of the run starts and ends where the
# last one ends. A lone "・" or "-" is left untouched.
# ------------------------------------------
def _merge_runs(items, token, replacement):
    """Return a copy of ``items`` with runs of ``token`` words merged.

    Args:
        items: list of word dicts with "text", "start" and "end" keys.
        token: exact text that identifies a word belonging to a run.
        replacement: text given to the merged word.

    Returns:
        A new list. Each run of two or more consecutive words whose text
        equals ``token`` is replaced by one dict copied from the first word of
        the run, with "text" set to ``replacement``, "start" taken from the
        first word and "end" from the last. All other words are passed through
        as the same objects.
    """
    merged = []
    total = len(items)
    idx = 0
    while idx < total:
        # Find the last index of the run that starts at idx (idx itself if no run).
        run_end = idx
        if items[idx]["text"] == token:
            while run_end + 1 < total and items[run_end + 1]["text"] == token:
                run_end += 1
        if run_end > idx:
            combined_word = dict(items[idx])
            combined_word["text"] = replacement
            combined_word["start"] = items[idx]["start"]
            combined_word["end"] = items[run_end]["end"]
            merged.append(combined_word)
        else:
            merged.append(items[idx])
        idx = run_end + 1
    return merged


# One linear pass per token; this replaces two copy-pasted loops that rewrote
# the list with a slice assignment for every run they found.
words = _merge_runs(words, "・", "…")
words = _merge_runs(words, "-", "——")
# ------------------------------------------
# Split every word whose text contains punctuation (any character in `punc`)
# into separate words. For example "aaa.bbb" becomes "aaa", ".", "bbb". The
# original time span is shared between the non-punctuation pieces in
# proportion to their length; punctuation pieces take zero time.
# ------------------------------------------
def _split_text(text):
    """Cut ``text`` into runs of ordinary characters and single punctuation marks."""
    pieces = []
    pending = ""
    for ch in text:
        if ch not in punc:
            pending += ch
            continue
        # A punctuation mark closes the pending run and is a piece of its own.
        if pending:
            pieces.append(pending)
        pieces.append(ch)
        pending = ""
    if pending:
        pieces.append(pending)
    return pieces


def _split_word(entry):
    """Return the list of words that ``entry`` turns into after splitting.

    Entries typed "audio_event", entries whose text has at most one character
    and entries that yield a single piece are returned unchanged (same object).
    """
    if entry["type"] == "audio_event" or len(entry["text"]) <= 1:
        return [entry]

    pieces = _split_text(entry["text"])
    if len(pieces) <= 1:
        return [entry]

    # Number of characters that actually consume time.
    spoken_length = sum(len(piece) for piece in pieces if piece not in punc)
    whole_span = entry["end"] - entry["start"]

    cursor = entry["start"]
    produced = []
    for piece in pieces:
        clone = dict(entry)
        clone["text"] = piece
        clone["start"] = cursor
        if piece not in punc:
            share = whole_span * (len(piece) / spoken_length) if spoken_length > 0 else 0
            cursor = cursor + share
        # Punctuation leaves the cursor where it was, so start == end.
        clone["end"] = cursor
        produced.append(clone)
    return produced


words = [piece for entry in words for piece in _split_word(entry)]
# ------------------------------------------
def _speaker_label(speaker_id):
    """Return ``speaker_id`` without a leading "Speaker " or "speaker_" prefix.

    ``str.lstrip`` cannot be used for this: its argument is a set of
    characters, so it would also eat the beginning of the actual label
    whenever that label starts with one of the prefix's letters. A missing
    (``None``) id yields an empty string.
    """
    label = speaker_id or ""
    for prefix in ("Speaker ", "speaker_"):
        if label.startswith(prefix):
            label = label[len(prefix):]
    return label


def _collapse_space_runs(line):
    """Return ``line`` with each run of 2+ space-only words merged into one.

    The merged word is a copy of the first word of the run with text " ",
    the start time of the first word and the end time of the last word.
    """
    # NOTE(review): both entries are the same character here; presumably one
    # of them was a full-width space in the original file - confirm.
    space_texts = [" ", " "]
    collapsed = []
    pos = 0
    while pos < len(line):
        run_end = pos
        if line[pos]["text"] in space_texts:
            while run_end + 1 < len(line) and line[run_end + 1]["text"] in space_texts:
                run_end += 1
        if run_end > pos:
            combined_space = dict(line[pos])
            combined_space["text"] = " "
            combined_space["start"] = line[pos]["start"]
            combined_space["end"] = line[run_end]["end"]
            collapsed.append(combined_space)
        else:
            collapsed.append(line[pos])
        pos = run_end + 1
    return collapsed


# Maps the index of an event in `subs` to the list of warnings raised for it.
flag_dict = {}

# Characters that always close a subtitle line.
line_end_marks = punc1 + punc3
last_index = len(words) - 1

for i, word in enumerate(words):
    current_line.append(word)

    # Slicing instead of indexing keeps an empty text from raising IndexError;
    # "" is not a member of either punctuation list.
    last_char = word["text"][-1:]
    line_length = sum(len(w["text"]) for w in current_line)
    # .get() matches the lookup used for the event name below and tolerates
    # words that carry no speaker information.
    speaker_changes = i < last_index and word.get("speaker_id") != words[i + 1].get("speaker_id")

    # The line ends on sentence punctuation, on a comma once the line is long
    # enough, on a speaker change, or at the very last word. The last word is
    # detected by position: comparing dicts would also match an earlier word
    # that merely has the same content.
    if not (
        last_char in line_end_marks
        or (last_char in punc2 and line_length >= 15)
        or speaker_changes
        or i == last_index
    ):
        continue

    speaker_id = current_line[0].get("speaker_id", "")

    for w in current_line:
        # Drop trailing sentence punctuation; commas become spaces.
        while len(w["text"]) > 0 and w["text"][-1] in punc1:
            w["text"] = w["text"][:-1]
        for p in punc2:
            w["text"] = w["text"].replace(p, " ")

    current_line = _collapse_space_runs(current_line)

    # Trim words that are empty or whitespace-only from both ends of the line.
    while current_line and not current_line[0]["text"].strip():
        current_line = current_line[1:]
    while current_line and not current_line[-1]["text"].strip():
        current_line = current_line[:-1]

    if (
        current_line
        and sum(len(w["text"].strip()) for w in current_line) > 0
        and current_line[-1]["end"] - current_line[0]["start"] > 0
    ):
        # Clamp an over-long first word from its start and an over-long last
        # word from its end, so the line does not appear too early or linger.
        first_word = current_line[0]
        last_word = current_line[-1]
        if first_word["end"] - first_word["start"] > max_word_duration:
            first_word["start"] = first_word["end"] - max_word_duration
        if last_word["end"] - last_word["start"] > max_word_duration:
            last_word["end"] = last_word["start"] + max_word_duration

        duration_list = [w["end"] - w["start"] for w in current_line]
        normal_text = "".join(w["text"] for w in current_line)
        # One \kf tag per word; the tag value is the duration in centiseconds.
        karaoke_text = "".join(
            f"{{\\kf{round(duration * 100)}}}{w['text']}"
            for duration, w in zip(duration_list, current_line)
        )

        if normal_text.strip():
            event = pysubs2.SSAEvent(
                start=current_line[0]["start"] * 1000,
                end=current_line[-1]["end"] * 1000,
                text=karaoke_text.strip(),
                style="ja",
                name=_speaker_label(speaker_id),
            )
            subs.append(event)
            # Any word still longer than the limit gets the event flagged for review.
            if max(duration_list) > max_word_duration:
                flag_dict.setdefault(len(subs) - 1, []).append("duration")

    current_line = []
# Print every flagged event: its 1-based number, the flags raised for it and
# the event as it will appear in the ASS file.
for event_index, event in enumerate(subs):
    if event_index not in flag_dict:
        continue
    flags = " ".join(flag_dict[event_index])
    rendered_event = events_to_string([event])
    print(f"[{event_index + 1}] {flags}\n{rendered_event}")

# Write the result next to the transcript, once as ASS and once as SRT.
for extension in ("ass", "srt"):
    subs.save(rf"{path}\scribe.{extension}")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement