Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# TODO (mb 2018-09-10): don't align last element on line if
#   followed by a comment
# TODO (mb 2018-09-09): respect fmt: off, fmt: on
# TODO (mb 2018-09-08): replace in python context r"\[\"\w[\w\d\-]*\"\]"
#   ie. mydict["mykey"] -> mydict['mykey']
# TODO (mb 2018-09-08): replace in dict context r"\[\"\w[\w\d\-]*\]\":"
#   ie. "mykey": "myval", -> 'mykey': "myval",
import re
from enum import Enum
# Explicit imports instead of `from typing import *`: only these names are
# used in this file, and a wildcard import hides where names come from.
from typing import Dict, Iterator, List, NamedTuple, Optional, Tuple
# Type alias: the full text of a source file.
FileContent = str

# Demo input for the invocation at the bottom of this file. It deliberately
# contains strings, docstrings, comments, dicts of numbers, and escaped
# quotes — the edge cases the tokenizer must consume opaquely.
# NOTE(review): the original indentation of this embedded snippet appears to
# have been lost in the paste this file was recovered from; the demo still
# runs, but its output will differ from the author's original — TODO confirm.
str_contents = """
#!/usr/bin/env python3
# fmt: on
# Some license here.
#
# Has many lines. Many, many lines.
# Many, many, many lines.
\"\"\"Module docstring.
Possibly also many, many lines.
\"\"\"
import os.path
import sys
import a
from b.c import X # some noqa comment
def test_parse():
\"\"\"Docstring comes first.
Possibly many lines.
\"\"\"
# FIXME: Some comment about why this function is crap but still in production.
environ = {
"MYAPP_DB_HOST": "1.2.3.4",
"MYAPP_DB_PORT": "1234",
'MYAPP_DB_PASSWORD': "secret",
'MYAPP_DB_READ_ONLY': "0",
'MYAPP_DB_DDL': "~/mkdb.sql",
'MYAPP_DL': 123_123,
'MYAPP_DL': 123_123_929,
'MYAPP_DBDL': 12,
}
barf = {
22: 23_222,
2234: 231_231_231_232,
1234: 231_231_232,
}
dbenv = myenv.parse(DBEnv, environ, prefix="MYAPP_DB_")
assert dbenv.host == "1.2.3.4"
assert dbenv.user == "new_user"
assert dbenv.password == "secret"
assert dbenv.read_only is False
assert isinstance(dbenv.ddl, pl.Path)
assert str(dbenv.ddl).endswith("mkdb.sql")
assert len(attrnames) == 7
GLOBAL_STATE = {"a": a(1), "b": a(2), "c": a(3)}
'What\\'s the deal "here"?'
'And "here"?'
StrList = List[str]
PathList = List[Path]
class TestEnv(myenv.BaseEnv):
'''
notakey: notaval
notakeyeither: notaval
'''
str_val_wo_default: str
float_val_wo_default: float
path_val_wo_default: pl.Path
paths_val_wo_default: List[pl.Path]
str_val: str = "foo"
strs_val: StrList = ["foo = bar", "barfoo = baz"]
float_val: float = 12.34
path_val: pl.Path = pl.Path("file.txt")
paths_val: PathList = [
[1, 2, 3, 4],
[5, 6, 7, 8],
[1, 22, 2243, 4],
{23: "asdf", 22: "aaaa", 443: "bbbb", 439: "cccc"},
{1123: "asdf", 22: "k3k3", 443: "jfjf", 439: "k2k2"},
{1: "asdf", 2332: "asdfk3k3", 42243: "jssdfjf", 4: "k2k2eie"},
]
\"\"\"Docstring for instance attribute spam.\"\"\"
"""
# Alternative inputs used during development:
# str_contents = open("blacker.py").read()
# str_contents = open("test.txt").read()
# Separator tokens that may have padding inserted *before* them so that
# they line up vertically across consecutive, structurally-identical rows.
# Kept hand-grouped (longest operators first, mirroring the regex below),
# hence the fmt: off guard.
# fmt: off
ALIGN_BEFORE_TOKENS = {
    "<<=", ">>=", "**=", "//=",
    "+=", "-=", "*=", "/=", "%=", "|=", "&=", "@=",
    "==", "!=", "<=", ">=",
    "//", "<<", ">>", "^=", "~=",
    "in", "is", "},", "],", "),",
    "->",
    ",", ":", "=",
    "+", "-", "*", "/",
    "%", "|", "&", "^", "~",
    "!", "<", ">",
    "}", "]", ")",
}
# fmt: on
# For each token that opens a region the aligner must never touch
# (strings, docstrings, comments), the regex that finds the end of that
# region. The negative lookbehind skips closing quotes that are
# backslash-escaped (but not double-backslash-escaped).
NO_ALIGN_BLOCK_END_MATCHERS = {
    "'''": re.compile(r"(?<![^\\]\\)'''"),
    '"""': re.compile(r"(?<![^\\]\\)\"\"\""),
    '"' : re.compile(r"(?<![^\\]\\)\""),
    "'" : re.compile(r"(?<![^\\]\\)'"),
    # a comment ends at end of line (zero-width match before "\n")
    "#": re.compile(r"$", flags=re.MULTILINE),
}
# Master token-separator pattern. Alternatives are ordered longest-first so
# that e.g. "<<=" wins over "<<" and "<". String/comment openers appear
# first and are then consumed as opaque blocks by the tokenizer. The final
# `$` (with MULTILINE) is a zero-width match before each newline, so every
# search is guaranteed to succeed. Indentation inside the pattern is
# insignificant under re.VERBOSE.
ALIGN_TOKEN_SEP_RE = re.compile(
    r"""
    (
    '''
    |\"\"\"
    |\"
    |'
    |\#
    |<<= |>>= |\*\*= |//=
    |\+= |\-= |\*= |/= |%= |\|= |&= |@=
    |== |!= |<= |>=
    |// |<< |>> |\^= |~=
    |(?<!\w)in(?!\w) |(?<!\w)is(?!\w) |\}, |\], |\),
    |\->
    |, |: |=
    |\+ |\- |\* |/
    |% |\| |& |\^ |~
    |! |< |>
    |\{ |\[ |\(
    |\} |\] |\)
    |$
    )
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
class TokenType(Enum):
    """Category of a token produced by ``tokenize_for_alignment``."""

    INDENT = 0  # leading spaces/tabs of a line
    SEPARATOR = 1  # operator/punctuation; alignment may pad before it
    CODE = 2  # any other run of source text
    BLOCK = 3  # a whole string/docstring/comment, consumed opaquely
    NEWLINE = 4  # a single "\n" (black normalizes CRLF to LF)
# The raw text of a token, exactly as it appears in the source.
TokenVal = str


class Token(NamedTuple):
    """One lexical unit of the source, tagged with its category."""

    typ: TokenType
    val: TokenVal
def tokenize_for_alignment(src_contents: str) -> Iterator[Token]:
    """Lex ``src_contents`` into a flat stream of ``Token``s.

    Strings, docstrings and comments are consumed whole as BLOCK tokens so
    that separator characters inside them can never trigger alignment.
    Every consumed slice of the input is yielded, so concatenating all
    token values reproduces the input text.
    """
    rest = src_contents
    # Loop-progress guard: every iteration must shrink/advance `rest`.
    prev_rest = None
    while rest:
        assert rest != prev_rest, "No progress at: " + repr(rest[:40])
        prev_rest = rest
        # The pattern's `$` alternative always matches, so search cannot fail.
        curr_token_sep = ALIGN_TOKEN_SEP_RE.search(rest)
        assert curr_token_sep is not None
        curr_token_start, curr_token_end = curr_token_sep.span()
        # newline match has zero width
        is_newline = curr_token_start == curr_token_end
        if is_newline:
            # adjust for zero width match
            curr_token_end = curr_token_start + 1
            # Get everything (if anything) up to (and excluding) the newline
            token_val = rest[:curr_token_start]
            if token_val:
                assert token_val != "\n"
                yield Token(TokenType.CODE, token_val)
            # The newline itself (note that black promises to
            # have normalized CRLF etc. to plain LF)
            token_val = rest[curr_token_start:curr_token_end]
            assert token_val == "\n"
            yield Token(TokenType.NEWLINE, token_val)
            rest = rest[curr_token_end:]
            # parse any indent at the start of the next line
            new_rest = rest.lstrip(" \t")
            indent_len = len(rest) - len(new_rest)
            if indent_len > 0:
                indent_token_val = rest[:indent_len]
                yield Token(TokenType.INDENT, indent_token_val)
                rest = new_rest
        elif curr_token_start > 0:
            # Plain code that precedes the matched separator.
            prev_token_val = rest[:curr_token_start]
            rest = rest[curr_token_start:]
            assert prev_token_val != "\n"
            assert prev_token_val not in ALIGN_BEFORE_TOKENS, repr(prev_token_val)
            yield Token(TokenType.CODE, prev_token_val)
        else:
            token_val = curr_token_sep.group(0)
            if token_val in NO_ALIGN_BLOCK_END_MATCHERS:
                # comment, string or docstring: consume through its closer
                block_begin_val = token_val
                assert curr_token_end > 0
                rest = rest[len(block_begin_val) :]
                end_matcher = NO_ALIGN_BLOCK_END_MATCHERS[token_val]
                block_end_match = end_matcher.search(rest)
                assert block_end_match, rest[:40]
                block_end_token = block_end_match.group(0)
                block_end_index = block_end_match.span()[-1]
                assert block_end_index <= len(rest), f"{len(rest)} < {block_end_index}"
                block_rest = rest[:block_end_index]
                block_token_val = block_begin_val + block_rest
                assert block_token_val.endswith(block_end_token)
                yield Token(TokenType.BLOCK, block_token_val)
                rest = rest[block_end_index:]
            else:
                sep_token_val = token_val
                yield Token(TokenType.SEPARATOR, sep_token_val)
                rest = rest[curr_token_end:]
        # NOTE (mb 2018-09-09): The way we tokenize, we always consume
        # all content belonging to strings and comments. This means that
        # the rest (after consuming all content of a string or comment),
        # should continue to be valid python. This means we can do some
        # basic sanity checks. For example, no valid python token begins
        # with a questionmark (though this is actually introduced because
        # one of the test cases conveniently has a questionmark as the
        # first character after an edge case of string parsing).
        assert not rest.startswith("?"), repr(rest)
# Semantic aliases used throughout the alignment phase.
Indent = str  # leading whitespace of a line
RowIndex = int  # index of a row (source line) in the token table
ColIndex = int  # index of a token within a row
OffsetWidth = int  # width of the token immediately before a separator
TokenTable = List[List[Token]]  # one inner list of tokens per source line
class RowLayoutToken(NamedTuple):
    """Disambiguate between lines with different layout/structure.

    We only want to align lines which have the same structure of
    indent and separators. Any difference in the number of elements
    or type of separators causes alignment to be disabled.
    """

    typ: TokenType
    # val is only set if it should cause a different prefix
    # eg. if a separator is a comma vs a period.
    val: Optional[TokenVal]
# Tokens whose values are relevant to the layout of a cell group.
# Set literal instead of set([...]) — same contents, no throwaway list.
LAYOUT_VAL_TOKENS = {TokenType.SEPARATOR, TokenType.INDENT}

# The per-row sequence of layout tokens; rows must produce identical
# layout tuples to be aligned with each other.
RowLayoutTokens = Tuple[RowLayoutToken, ...]
class AlignmentContextKey(NamedTuple):
    """Does not change between multiple lines that can be aligned."""

    col_idx: ColIndex
    tok_typ: TokenType
    tok_val: TokenVal
    layout : RowLayoutTokens


# Maps each candidate alignment point of one row to the width of the
# token immediately before it.
AlignmentContext = Dict[AlignmentContextKey, OffsetWidth]
class AlignmentCellKey(NamedTuple):
    """Key of a cell group; tracks the last row the group has reached."""

    last_row_index: RowIndex
    col_index : ColIndex
    token_val : TokenVal
    layout : RowLayoutTokens


class AlignmentCell(NamedTuple):
    """One alignable separator occurrence on a specific row."""

    row_idx: RowIndex
    offset_width: OffsetWidth


# Groups of cells on consecutive rows that should be padded together.
CellGroups = Dict[AlignmentCellKey, List[AlignmentCell]]
def find_alignment_contexts(table: TokenTable) -> Iterator[AlignmentContext]:
    """Yield one ``AlignmentContext`` per row of ``table``.

    For every separator in ``ALIGN_BEFORE_TOKENS`` the context records the
    width of the token just before it, keyed by column, token and the row's
    layout so far — so only structurally identical rows can later be
    grouped together.

    Fixes vs. previous revision: removed ``maybe_indent_token``/``indent``,
    which were computed but never used, and renamed the misspelled local
    ``layou_ttoken_val`` to ``layout_token_val``. No behavior change.
    """
    for row in table:
        ctx: AlignmentContext = {}
        layout: RowLayoutTokens = tuple()
        for col_index, token in enumerate(row):
            layout_token_val: Optional[TokenVal]
            if token.typ in LAYOUT_VAL_TOKENS:
                if token.val in ALIGN_BEFORE_TOKENS:
                    layout_token_val = token.val
                elif col_index > 0:
                    # Layout tokens such as ([{ don't cause alignment to
                    # their preceding token, so the line offset up to the
                    # column of those tokens can be different. We only
                    # want to continue with alignment if the tokens are
                    # all at the same line offset.
                    layout_token_val = token.val + f"::{len(row[col_index - 1].val)}"
                else:
                    layout_token_val = None
            else:
                layout_token_val = None
            layout += (RowLayoutToken(token.typ, layout_token_val),)
            if token.val in ALIGN_BEFORE_TOKENS:
                assert token.typ == TokenType.SEPARATOR
                prev_token = row[col_index - 1]
                # Two separators in a row (e.g. "],") — nothing to pad.
                if prev_token.typ == TokenType.SEPARATOR:
                    continue
                offset_width = len(prev_token.val)
                ctx_key = AlignmentContextKey(col_index, token.typ, token.val, layout)
                ctx[ctx_key] = offset_width
        yield ctx
def find_cell_groups(alignment_contexts: List[AlignmentContext]) -> CellGroups:
    """Group alignment cells that appear on consecutive rows.

    A cell extends an existing group when the previous row produced a cell
    with the same column, token value and layout; otherwise it starts a new
    group. The group key always carries the last row index reached, so a
    gap of even one row closes the group.
    """
    groups: Dict[AlignmentCellKey, List[AlignmentCell]] = {}
    for row_idx, ctx in enumerate(alignment_contexts):
        for key, width in sorted(ctx.items()):
            col_idx, _tok_typ, tok_val, layout = key
            predecessor_key = AlignmentCellKey(row_idx - 1, col_idx, tok_val, layout)
            successor_key = AlignmentCellKey(row_idx, col_idx, tok_val, layout)
            cell = AlignmentCell(row_idx, width)
            carried = groups.pop(predecessor_key, None)
            if carried is None:
                groups[successor_key] = [cell]
            else:
                groups[successor_key] = carried + [cell]
    return groups
def realigned_contents(table: TokenTable, cell_groups: CellGroups) -> str:
    """Pad tokens in ``table`` per ``cell_groups`` and render back to text.

    Only groups spanning at least 3 rows are aligned. Within a group the
    token before each separator is padded to the widest such token: numeric
    literals are padded on the left (right-aligned), everything else on the
    right (left-aligned). A separator that is the last token of its row is
    left alone.

    NOTE: mutates the rows of ``table`` in place while rendering.

    Fix vs. previous revision: removed ``prev_col_index``, which was
    assigned on every iteration but never read. No behavior change.
    """
    for ctx_key, cells in sorted(cell_groups.items()):
        # Aligning only two rows adds churn for little readability gain.
        if len(cells) < 3:
            continue
        max_offset_width = max(ow for _, ow in cells)
        for row_index, offset_width in cells:
            extra_offset = max_offset_width - offset_width
            if extra_offset == 0:
                continue
            row = table[row_index]
            left_token = row[ctx_key.col_index - 1]
            # Underscore digit separators are ignored when deciding
            # whether the token is a number.
            maybe_number = left_token.val.strip().replace("_", "")
            if maybe_number.isdigit():
                padded_left_token_val = " " * extra_offset + left_token.val
            elif row[ctx_key.col_index + 1].typ == TokenType.NEWLINE:
                # don't align if this is the last token of the row
                continue
            else:
                padded_left_token_val = left_token.val + " " * extra_offset
            row[ctx_key.col_index - 1] = Token(TokenType.CODE, padded_left_token_val)
    return "".join("".join(token.val for token in row) for row in table)
def align_formatted_str(src_contents: str) -> FileContent:
    """Re-align separators of already-formatted source text.

    Pipeline: tokenize -> split the token stream into rows at newlines ->
    collect per-row alignment contexts -> group compatible cells across
    consecutive rows -> pad and re-render.
    """
    # Set to 1 to dump tokens, rows and cell groups while developing.
    debug = 0
    table: TokenTable = [[]]
    for token in tokenize_for_alignment(src_contents):
        if debug:
            print("TOKEN: ", repr(token.val).ljust(50), token)
        table[-1].append(token)
        if token.typ == TokenType.NEWLINE:
            table.append([])
        else:
            # Only BLOCK tokens (strings/comments) may span lines.
            assert token.typ == TokenType.BLOCK or "\n" not in token.val
    if debug:
        for row in table:
            print("ROW: ", end="")
            for tok_cell in row:
                print(tok_cell, end="\n ")
            print()
    alignment_contexts = list(find_alignment_contexts(table))
    cell_groups = find_cell_groups(alignment_contexts)
    if debug:
        for cell_key, cells in cell_groups.items():
            if len(cells) > 1:
                print("CELL", len(cells), cell_key)
                for all_cell in cells:
                    print("\t\t", all_cell)
    return realigned_contents(table, cell_groups)
- print(align_formatted_str(str_contents))
Add Comment
Please, Sign In to add comment