# scribe-json-to-ass: convert a Scribe JSON transcript into karaoke-timed ASS/SRT subtitles

# ver.4 (Apr 18, 2025)

import msgspec
import pysubs2

# --- Configuration and input -------------------------------------------------
# Directory holding the input ``scribe.json``; the generated subtitle files
# are saved into the same directory at the end of the script.
path = r"MIRU\ep3"
# Threshold for a single word's duration. The first/last word of a subtitle
# line is clamped to it, and lines containing a longer word are reported.
max_word_duration = 1  # in seconds

# Sentence-ending punctuation: always closes the current subtitle line and is
# removed from the rendered text.
punc1 = ["。", "！", "？", ".", "!", "?"]
# Clause-separating punctuation: closes the line only once the line is long
# enough, and is rendered as a full-width space.
# NOTE(review): "、" is listed twice -- harmless, but possibly a typo for
# another comma variant; confirm with the author.
punc2 = ["、", "，", "、", ","]
# Ellipsis: closes the line but is kept in the rendered text.
punc3 = ["…"]
# Characters at which a single transcribed word is split into separate words.
punc = punc1 + punc2

# Read the transcript inside a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open(rf"{path}\scribe.json", "rb") as scribe_file:
    scribe = msgspec.json.decode(scribe_file.read())
# words = sum([segment["words"] for segment in scribe["segments"]], [])  # web ui
words = scribe["words"]  # api


def events_to_string(events):
    """Serialize *events* to the bare event lines of an ASS ``[Events]`` section.

    The events are placed into a throwaway ``pysubs2.SSAFile`` and rendered
    as ASS text; everything up to and including the ``[Events]`` header and
    its ``Format:`` line is discarded.

    Args:
        events: the events to render (assigned to ``SSAFile.events`` as-is).

    Returns:
        The text that follows the ``Format:`` line, with surrounding
        whitespace stripped.

    Raises:
        IndexError: if the rendered text does not contain the expected
            ``[Events]`` header followed by the expected ``Format:`` line.
    """
    events_header = (
        "\n[Events]\n"
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    )

    container = pysubs2.SSAFile()
    container.events = events
    rendered = container.to_string(format_="ass")

    # Index 1 is the text between the header and the end of the document.
    pieces = rendered.split(events_header)
    return pieces[1].strip()


# The single style used by every generated event; it is registered under the
# name "ja". Colours are passed as ASS-style colour strings.
style = pysubs2.SSAStyle(
    fontname="IPAexGothic",
    fontsize=40,
    primarycolor="&H00FFFFFF",
    secondarycolor="&H00FFD500",
    outlinecolor="&H00000000",
    backcolor="&H004A4A4A",
    bold=True,
    alignment=2,
    shadow=0,
)

# Output document: carries the "ja" style and a 1920x1080 script resolution.
subs = pysubs2.SSAFile()
subs.styles["ja"] = style
subs.info.update({"PlayResX": "1920", "PlayResY": "1080"})

# Words collected for the subtitle line currently being assembled.
current_line = []

# Keep only entries that are not audio events, then turn every ASCII
# three-dot sequence into the single ellipsis character.
words = list(filter(lambda entry: entry["type"] != "audio_event", words))
for word in words:
    word.update(text=word["text"].replace("...", "…"))

# Collapse runs of repeated filler symbols: two or more consecutive words
# whose text is "・" become a single "…" word, and two or more consecutive "-"
# words become a single "——" word. The merged word starts where the first
# word of the run starts and ends where the last one ends.
# ------------------------------------------
def _merge_runs(items, symbol, replacement):
    """Merge each run of 2+ consecutive words whose text equals *symbol*.

    Args:
        items: list of word dicts with "text", "start" and "end" keys.
        symbol: text that identifies the words of a run.
        replacement: text given to the merged word.

    Returns:
        A new list in which every such run is replaced by one copy of the
        run's first word, with "text" set to *replacement*, "start" taken
        from the first word and "end" from the last. A lone *symbol* word
        is left untouched.
    """
    merged = []
    total = len(items)
    idx = 0
    while idx < total:
        # Find the last index of the run that begins at idx (if any).
        run_end = idx
        if items[idx]["text"] == symbol:
            while run_end + 1 < total and items[run_end + 1]["text"] == symbol:
                run_end += 1

        if run_end > idx:
            combined = dict(items[idx])
            combined["text"] = replacement
            combined["start"] = items[idx]["start"]
            combined["end"] = items[run_end]["end"]
            merged.append(combined)
        else:
            merged.append(items[idx])
        idx = run_end + 1
    return merged


# Building a new list in one pass avoids repeated in-place slice replacement.
words = _merge_runs(words, "・", "…")
words = _merge_runs(words, "-", "——")
# ------------------------------------------

# Split words that contain punctuation from ``punc``. For example a word with
# text "aaa.bbb" becomes three words "aaa", ".", "bbb". The original time span
# is shared between the non-punctuation pieces in proportion to their text
# length; punctuation pieces take zero time. Audio events and words of at most
# one character are passed through unchanged.
# ------------------------------------------
def _split_at_punctuation(text):
    """Split *text* so that each character found in ``punc`` is its own piece."""
    pieces = []
    buffer = ""
    for ch in text:
        if ch in punc:
            # Flush the text gathered so far, then emit the punctuation mark.
            if buffer:
                pieces.append(buffer)
            pieces.append(ch)
            buffer = ""
        else:
            buffer += ch
    if buffer:
        pieces.append(buffer)
    return pieces


processed_words = []

for word in words:
    splittable = word["type"] != "audio_event" and len(word["text"]) > 1
    pieces = _split_at_punctuation(word["text"]) if splittable else []

    # Nothing to split: keep the original word object.
    if len(pieces) <= 1:
        processed_words.append(word)
        continue

    # Number of characters that actually consume time.
    spoken_chars = sum(len(piece) for piece in pieces if piece not in punc)
    span = word["end"] - word["start"]
    cursor = word["start"]

    for piece in pieces:
        part = dict(word)
        part["text"] = piece
        part["start"] = cursor
        if piece not in punc:
            # Advance the cursor by this piece's share of the word's duration.
            share = span * (len(piece) / spoken_chars) if spoken_chars > 0 else 0
            cursor = cursor + share
        part["end"] = cursor
        processed_words.append(part)

words = processed_words
# ------------------------------------------

def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if it is absent.

    Unlike ``str.lstrip``, which removes any leading characters contained in
    its argument, this removes the exact prefix once.
    """
    return text[len(prefix):] if text.startswith(prefix) else text


def _merge_space_runs(line):
    """Collapse each run of 2+ space-only words into one full-width space.

    A word counts as a space when its text is exactly " " or "　". The merged
    word is a copy of the run's first word with text "　", the first word's
    start and the last word's end. Returns a new list.
    """
    spaces = (" ", "　")
    merged = []
    idx = 0
    while idx < len(line):
        run_end = idx
        if line[idx]["text"] in spaces:
            while run_end + 1 < len(line) and line[run_end + 1]["text"] in spaces:
                run_end += 1

        if run_end > idx:
            combined = dict(line[idx])
            combined["text"] = "　"
            combined["start"] = line[idx]["start"]
            combined["end"] = line[run_end]["end"]
            merged.append(combined)
        else:
            merged.append(line[idx])
        idx = run_end + 1
    return merged


# Maps the index of an event in ``subs`` to the list of flags raised for it.
flag_dict = {}

# Group the words into subtitle lines and emit one karaoke event per line.
current_line = []
line_breakers = punc1 + punc3
last_index = len(words) - 1

for i, word in enumerate(words):
    current_line.append(word)

    # Slicing (rather than indexing) yields "" for an empty text instead of
    # raising IndexError.
    last_char = word["text"][-1:]
    # Compare positions, not dict contents: an earlier word that happens to
    # equal the final one must not close the line.
    is_last_word = i == last_index
    speaker_changes = (
        not is_last_word
        and word.get("speaker_id") != words[i + 1].get("speaker_id")
    )

    # A line is closed by sentence-ending punctuation or an ellipsis, by a
    # clause separator once the line holds at least 15 characters, by a
    # speaker change, or by reaching the last word.
    closes_line = (
        last_char in line_breakers
        or (last_char in punc2 and sum(len(w["text"]) for w in current_line) >= 15)
        or speaker_changes
        or is_last_word
    )
    if not closes_line:
        continue

    # A missing or null speaker id is treated as "no speaker".
    speaker_id = current_line[0].get("speaker_id") or ""

    # Drop trailing sentence-ending punctuation and turn clause separators
    # into full-width spaces.
    for w in current_line:
        while len(w["text"]) > 0 and w["text"][-1] in punc1:
            w["text"] = w["text"][:-1]
        for p in punc2:
            w["text"] = w["text"].replace(p, "　")

    current_line = _merge_space_runs(current_line)

    # Trim blank words from both ends of the line.
    while current_line and not current_line[0]["text"].strip():
        current_line = current_line[1:]
    while current_line and not current_line[-1]["text"].strip():
        current_line = current_line[:-1]

    # After trimming, a non-empty line starts with a non-blank word, so it is
    # guaranteed to contain visible text; only the time span needs checking.
    if current_line and current_line[-1]["end"] - current_line[0]["start"] > 0:
        first_word = current_line[0]
        final_word = current_line[-1]

        # Clamp over-long edge words so the event does not start too early
        # or end too late.
        if first_word["end"] - first_word["start"] > max_word_duration:
            first_word["start"] = first_word["end"] - max_word_duration
        if final_word["end"] - final_word["start"] > max_word_duration:
            final_word["end"] = final_word["start"] + max_word_duration

        durations = [w["end"] - w["start"] for w in current_line]
        # One \kf tag per word; its value is the word duration in centiseconds.
        karaoke_text = "".join(
            f"{{\\kf{round(duration * 100)}}}{w['text']}"
            for duration, w in zip(durations, current_line)
        )

        event = pysubs2.SSAEvent(
            start=current_line[0]["start"] * 1000,
            end=current_line[-1]["end"] * 1000,
            text=karaoke_text.strip(),
            style="ja",
            name=_strip_prefix(_strip_prefix(speaker_id, "Speaker "), "speaker_"),
        )
        subs.append(event)

        # Flag lines that still contain a word longer than the threshold.
        if max(durations) > max_word_duration:
            flag_dict.setdefault(len(subs) - 1, []).append("duration")

    current_line = []

# Print every flagged event: its 1-based position, the flag names, and the
# event rendered as a raw ASS line.
for index, event in enumerate(subs):
    flags = flag_dict.get(index)
    if flags is None:
        continue
    header = f"[{index + 1}] {' '.join(flags)}"
    print(f"{header}\n{events_to_string([event])}")

# Write the result next to the input, once per output extension.
for extension in ("ass", "srt"):
    subs.save(rf"{path}\scribe.{extension}")