# ~/ComfyUI/custom_nodes/llm_stream_node.py
import requests, json, sys


class LLMStreamNode:
    CATEGORY = "LLM"
    OUTPUT_NODE = True
    RETURN_TYPES = ("STRING",)
    FUNCTION = "generate"
    DESCRIPTION = (
        "Connect to any OpenAI-compatible LLM endpoint.\n\n"
        " • Set 'Max Tokens' to 0 for no limit.\n"
        " • 'Deterministic' forces greedy sampling.\n"
        " • Disable 'Enable Thinking' to inject '/no_think'.\n"
        " • Enable 'Stream console' to print the token stream to the ComfyUI console."
    )
    # ---------------- strip reasoning helper -------------
    @staticmethod
    def _strip(txt, start_tag, end_tag):
        """Remove every start_tag…end_tag block from txt."""
        out, i = "", 0
        while (s := txt.find(start_tag, i)) != -1:
            out += txt[i:s]              # keep text before the tag
            e = txt.find(end_tag, s)
            if e == -1:                  # unterminated block: drop the remainder
                i = len(txt)
                break
            i = e + len(end_tag)         # resume after the closing tag
        out += txt[i:]
        return out.lstrip()
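    # For instance (a hypothetical reply, assuming the default tags):
    #   _strip("<think>plan the haiku…</think>\nHere it is.", "<think>", "</think>")
    # returns "Here it is." -- the reasoning block is removed and the leading
    # whitespace of the visible answer is trimmed by lstrip().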
    # ---------------- UI spec ----------------------------
    @classmethod
    def INPUT_TYPES(cls):
        return {"required": {
            "endpoint": ("STRING", {
                "default": "http://127.0.0.1:5000",
                "tooltip": "Base URL of OpenAI-compatible LLM server (no /v1/…)"
            }),
            "prompt": ("STRING", {"multiline": True}),
            "max_tokens": ("INT", {
                "default": 0, "min": 0, "max": 32768, "step": 512,
                "tooltip": "Maximum tokens to generate; set to 0 to disable any limit"
            }),
            "seed": ("INT", {"default": 1, "min": -1, "max": 2**31 - 1}),
            "deterministic": ("BOOLEAN", {
                "default": False,
                "tooltip": "Force temperature=0, top_p=1, top_k=0, repeat_penalty=1.0"
            }),
            "enable_thinking": ("BOOLEAN", {
                "default": True,
                "tooltip": "If False, injects `/no_think` as a system message"
            }),
            "stream_console": ("BOOLEAN", {
                "default": True,
                "tooltip": "Print full token stream (thinking + answer) to console"
            }),
            "think_start_tag": ("STRING", {"default": "<think>"}),
            "think_end_tag": ("STRING", {"default": "</think>"})
        }}
    # ---------------- main -------------------------------
    def generate(self, endpoint, prompt, max_tokens, seed,
                 deterministic, enable_thinking, stream_console,
                 think_start_tag, think_end_tag):
        url = f"{endpoint.rstrip('/')}/v1/chat/completions"
        sys_prompt = "/think" if enable_thinking else "/no_think"
        body = {
            "messages": [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": prompt}
            ],
            "stream": stream_console
        }
        if max_tokens > 0:
            body["max_tokens"] = max_tokens
        # only set sampler params in deterministic mode
        if deterministic:
            body.update(dict(
                temperature=0,
                top_p=1,
                top_k=0,
                repeat_penalty=1.0
            ))
        if seed >= 0:
            body["seed"] = seed
        # -------- streaming branch --------
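        # OpenAI-compatible servers stream Server-Sent Events, one JSON chunk
        # per line; a hypothetical chunk (exact shape varies by backend) looks like:
        #   data: {"choices":[{"delta":{"content":"Hel"},"index":0}]}
        # and the stream normally ends with:
        #   data: [DONE]
        # The loop below extracts each delta's "content" and filters out any
        # text between the configured thinking tags.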
        if stream_console:
            answer, thinking = "", False
            try:
                with requests.post(url, json=body, stream=True, timeout=None) as r:
                    r.encoding = "utf-8"
                    r.raise_for_status()
                    for raw in r.iter_lines(decode_unicode=True):
                        if not raw or not raw.startswith("data:"):
                            continue
                        payload = raw[5:].strip()
                        if payload == "[DONE]":
                            break
                        delta = json.loads(payload)["choices"][0].get("delta") or {}
                        text = delta.get("content") or ""
                        if text:
                            sys.stdout.write(text)
                            sys.stdout.flush()
                        # assume the thinking tags arrive in their own chunks
                        if think_start_tag in text:
                            thinking = True
                            continue
                        if think_end_tag in text:
                            thinking = False
                            answer += text.split(think_end_tag, 1)[1]
                            continue
                        if not thinking:
                            answer += text
                    sys.stdout.write("\n")
            except Exception as e:
                answer = f"Error: {e}"
            return (answer.lstrip(),)
        # -------- blocking branch ---------
        try:
            r = requests.post(url, json=body, timeout=180)
            r.encoding = "utf-8"
            r.raise_for_status()
            raw = r.json()["choices"][0]["message"]["content"]
            answer = self._strip(raw, think_start_tag, think_end_tag)
        except Exception as e:
            answer = f"Error: {e}"
        return (answer.lstrip(),)


# register the node with ComfyUI
NODE_CLASS_MAPPINGS = {"LLMStream": LLMStreamNode}
NODE_DISPLAY_NAME_MAPPINGS = {"LLMStream": "Streaming LLM Text Generator"}
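

# A minimal standalone smoke test (not part of the ComfyUI node API) -- it
# assumes an OpenAI-compatible server is already listening on the default
# endpoint below; adjust the URL and prompt to your own setup.
if __name__ == "__main__":
    node = LLMStreamNode()
    (reply,) = node.generate(
        endpoint="http://127.0.0.1:5000",
        prompt="Summarise what a ComfyUI custom node is in one sentence.",
        max_tokens=128,
        seed=42,
        deterministic=True,
        enable_thinking=False,
        stream_console=False,
        think_start_tag="<think>",
        think_end_tag="</think>",
    )
    print(reply)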