Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# TODO (mb 2018-09-10): don't align last element on line if
#   followed by a comment
# TODO (mb 2018-09-09): respect fmt: off, fmt: on
# TODO (mb 2018-09-08): replace in python context r"\[\"\w[\w\d\-]*\"\]"
#   ie. mydict["mykey"] -> mydict['mykey']
# TODO (mb 2018-09-08): replace in dict context r"\[\"\w[\w\d\-]*\]\":"
#   ie. "mykey": "myval", -> 'mykey': "myval",
import re
from enum import Enum
# Explicit imports instead of `from typing import *`: only these names are
# used in this file, and a wildcard import hides where names come from.
from typing import Dict, Iterator, List, NamedTuple, Optional, Tuple
# Type alias: the full text of a source file.
FileContent = str

# Demo input for the invocation at the bottom of this file. It deliberately
# contains strings, docstrings, comments, dicts of numbers, and escaped
# quotes — the edge cases the tokenizer must consume opaquely.
# NOTE(review): the original indentation of this embedded snippet appears to
# have been lost in the paste this file was recovered from; the demo still
# runs, but its output will differ from the author's original — TODO confirm.
str_contents = """
#!/usr/bin/env python3
# fmt: on
# Some license here.
#
# Has many lines. Many, many lines.
# Many, many, many lines.
\"\"\"Module docstring.
Possibly also many, many lines.
\"\"\"
import os.path
import sys
import a
from b.c import X # some noqa comment
def test_parse():
\"\"\"Docstring comes first.
Possibly many lines.
\"\"\"
# FIXME: Some comment about why this function is crap but still in production.
environ = {
"MYAPP_DB_HOST": "1.2.3.4",
"MYAPP_DB_PORT": "1234",
'MYAPP_DB_PASSWORD': "secret",
'MYAPP_DB_READ_ONLY': "0",
'MYAPP_DB_DDL': "~/mkdb.sql",
'MYAPP_DL': 123_123,
'MYAPP_DL': 123_123_929,
'MYAPP_DBDL': 12,
}
barf = {
22: 23_222,
2234: 231_231_231_232,
1234: 231_231_232,
}
dbenv = myenv.parse(DBEnv, environ, prefix="MYAPP_DB_")
assert dbenv.host == "1.2.3.4"
assert dbenv.user == "new_user"
assert dbenv.password == "secret"
assert dbenv.read_only is False
assert isinstance(dbenv.ddl, pl.Path)
assert str(dbenv.ddl).endswith("mkdb.sql")
assert len(attrnames) == 7
GLOBAL_STATE = {"a": a(1), "b": a(2), "c": a(3)}
'What\\'s the deal "here"?'
'And "here"?'
StrList = List[str]
PathList = List[Path]
class TestEnv(myenv.BaseEnv):
'''
notakey: notaval
notakeyeither: notaval
'''
str_val_wo_default: str
float_val_wo_default: float
path_val_wo_default: pl.Path
paths_val_wo_default: List[pl.Path]
str_val: str = "foo"
strs_val: StrList = ["foo = bar", "barfoo = baz"]
float_val: float = 12.34
path_val: pl.Path = pl.Path("file.txt")
paths_val: PathList = [
[1, 2, 3, 4],
[5, 6, 7, 8],
[1, 22, 2243, 4],
{23: "asdf", 22: "aaaa", 443: "bbbb", 439: "cccc"},
{1123: "asdf", 22: "k3k3", 443: "jfjf", 439: "k2k2"},
{1: "asdf", 2332: "asdfk3k3", 42243: "jssdfjf", 4: "k2k2eie"},
]
\"\"\"Docstring for instance attribute spam.\"\"\"
"""
# Alternative inputs used during development:
# str_contents = open("blacker.py").read()
# str_contents = open("test.txt").read()
# Separator tokens that may have padding inserted *before* them so that
# they line up vertically across consecutive, structurally-identical rows.
# Kept hand-grouped (longest operators first, mirroring the regex below),
# hence the fmt: off guard.
# fmt: off
ALIGN_BEFORE_TOKENS = {
    "<<=", ">>=", "**=", "//=",
    "+=", "-=", "*=", "/=", "%=", "|=", "&=", "@=",
    "==", "!=", "<=", ">=",
    "//", "<<", ">>", "^=", "~=",
    "in", "is", "},", "],", "),",
    "->",
    ",", ":", "=",
    "+", "-", "*", "/",
    "%", "|", "&", "^", "~",
    "!", "<", ">",
    "}", "]", ")",
}
# fmt: on
# For each token that opens a region the aligner must never touch
# (strings, docstrings, comments), the regex that finds the end of that
# region. The negative lookbehind skips closing quotes that are
# backslash-escaped (but not double-backslash-escaped).
NO_ALIGN_BLOCK_END_MATCHERS = {
    "'''": re.compile(r"(?<![^\\]\\)'''"),
    '"""': re.compile(r"(?<![^\\]\\)\"\"\""),
    '"' : re.compile(r"(?<![^\\]\\)\""),
    "'" : re.compile(r"(?<![^\\]\\)'"),
    # a comment ends at end of line (zero-width match before "\n")
    "#": re.compile(r"$", flags=re.MULTILINE),
}
# Master token-separator pattern. Alternatives are ordered longest-first so
# that e.g. "<<=" wins over "<<" and "<". String/comment openers appear
# first and are then consumed as opaque blocks by the tokenizer. The final
# `$` (with MULTILINE) is a zero-width match before each newline, so every
# search is guaranteed to succeed. Indentation inside the pattern is
# insignificant under re.VERBOSE.
ALIGN_TOKEN_SEP_RE = re.compile(
    r"""
    (
    '''
    |\"\"\"
    |\"
    |'
    |\#
    |<<= |>>= |\*\*= |//=
    |\+= |\-= |\*= |/= |%= |\|= |&= |@=
    |== |!= |<= |>=
    |// |<< |>> |\^= |~=
    |(?<!\w)in(?!\w) |(?<!\w)is(?!\w) |\}, |\], |\),
    |\->
    |, |: |=
    |\+ |\- |\* |/
    |% |\| |& |\^ |~
    |! |< |>
    |\{ |\[ |\(
    |\} |\] |\)
    |$
    )
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
class TokenType(Enum):
    """Category of a token produced by ``tokenize_for_alignment``."""

    INDENT = 0  # leading spaces/tabs of a line
    SEPARATOR = 1  # operator/punctuation; alignment may pad before it
    CODE = 2  # any other run of source text
    BLOCK = 3  # a whole string/docstring/comment, consumed opaquely
    NEWLINE = 4  # a single "\n" (black normalizes CRLF to LF)
# The raw text of a token, exactly as it appears in the source.
TokenVal = str


class Token(NamedTuple):
    """One lexical unit of the source, tagged with its category."""

    typ: TokenType
    val: TokenVal
def tokenize_for_alignment(src_contents: str) -> Iterator[Token]:
    """Lex ``src_contents`` into a flat stream of ``Token``s.

    Strings, docstrings and comments are consumed whole as BLOCK tokens so
    that separator characters inside them can never trigger alignment.
    Every consumed slice of the input is yielded, so concatenating all
    token values reproduces the input text.
    """
    rest = src_contents
    # Loop-progress guard: every iteration must shrink/advance `rest`.
    prev_rest = None
    while rest:
        assert rest != prev_rest, "No progress at: " + repr(rest[:40])
        prev_rest = rest
        # The pattern's `$` alternative always matches, so search cannot fail.
        curr_token_sep = ALIGN_TOKEN_SEP_RE.search(rest)
        assert curr_token_sep is not None
        curr_token_start, curr_token_end = curr_token_sep.span()
        # newline match has zero width
        is_newline = curr_token_start == curr_token_end
        if is_newline:
            # adjust for zero width match
            curr_token_end = curr_token_start + 1
            # Get everything (if anything) up to (and excluding) the newline
            token_val = rest[:curr_token_start]
            if token_val:
                assert token_val != "\n"
                yield Token(TokenType.CODE, token_val)
            # The newline itself (note that black promises to
            # have normalized CRLF etc. to plain LF)
            token_val = rest[curr_token_start:curr_token_end]
            assert token_val == "\n"
            yield Token(TokenType.NEWLINE, token_val)
            rest = rest[curr_token_end:]
            # parse any indent at the start of the next line
            new_rest = rest.lstrip(" \t")
            indent_len = len(rest) - len(new_rest)
            if indent_len > 0:
                indent_token_val = rest[:indent_len]
                yield Token(TokenType.INDENT, indent_token_val)
                rest = new_rest
        elif curr_token_start > 0:
            # Plain code that precedes the matched separator.
            prev_token_val = rest[:curr_token_start]
            rest = rest[curr_token_start:]
            assert prev_token_val != "\n"
            assert prev_token_val not in ALIGN_BEFORE_TOKENS, repr(prev_token_val)
            yield Token(TokenType.CODE, prev_token_val)
        else:
            token_val = curr_token_sep.group(0)
            if token_val in NO_ALIGN_BLOCK_END_MATCHERS:
                # comment, string or docstring: consume through its closer
                block_begin_val = token_val
                assert curr_token_end > 0
                rest = rest[len(block_begin_val) :]
                end_matcher = NO_ALIGN_BLOCK_END_MATCHERS[token_val]
                block_end_match = end_matcher.search(rest)
                assert block_end_match, rest[:40]
                block_end_token = block_end_match.group(0)
                block_end_index = block_end_match.span()[-1]
                assert block_end_index <= len(rest), f"{len(rest)} < {block_end_index}"
                block_rest = rest[:block_end_index]
                block_token_val = block_begin_val + block_rest
                assert block_token_val.endswith(block_end_token)
                yield Token(TokenType.BLOCK, block_token_val)
                rest = rest[block_end_index:]
            else:
                sep_token_val = token_val
                yield Token(TokenType.SEPARATOR, sep_token_val)
                rest = rest[curr_token_end:]
        # NOTE (mb 2018-09-09): The way we tokenize, we always consume
        # all content belonging to strings and comments. This means that
        # the rest (after consuming all content of a string or comment),
        # should continue to be valid python. This means we can do some
        # basic sanity checks. For example, no valid python token begins
        # with a questionmark (though this is actually introduced because
        # one of the test cases conveniently has a questionmark as the
        # first character after an edge case of string parsing).
        assert not rest.startswith("?"), repr(rest)
# Semantic aliases used throughout the alignment phase.
Indent = str  # leading whitespace of a line
RowIndex = int  # index of a row (source line) in the token table
ColIndex = int  # index of a token within a row
OffsetWidth = int  # width of the token immediately before a separator
TokenTable = List[List[Token]]  # one inner list of tokens per source line
class RowLayoutToken(NamedTuple):
    """Disambiguate between lines with different layout/structure.

    We only want to align lines which have the same structure of
    indent and separators. Any difference in the number of elements
    or type of separators causes alignment to be disabled.
    """

    typ: TokenType
    # val is only set if it should cause a different prefix
    # eg. if a separator is a comma vs a period.
    val: Optional[TokenVal]
# Tokens whose values are relevant to the layout of a cell group.
# Set literal instead of set([...]) — same contents, no throwaway list.
LAYOUT_VAL_TOKENS = {TokenType.SEPARATOR, TokenType.INDENT}

# The per-row sequence of layout tokens; rows must produce identical
# layout tuples to be aligned with each other.
RowLayoutTokens = Tuple[RowLayoutToken, ...]
class AlignmentContextKey(NamedTuple):
    """Does not change between multiple lines that can be aligned."""

    col_idx: ColIndex
    tok_typ: TokenType
    tok_val: TokenVal
    layout : RowLayoutTokens


# Maps each candidate alignment point of one row to the width of the
# token immediately before it.
AlignmentContext = Dict[AlignmentContextKey, OffsetWidth]
class AlignmentCellKey(NamedTuple):
    """Key of a cell group; tracks the last row the group has reached."""

    last_row_index: RowIndex
    col_index : ColIndex
    token_val : TokenVal
    layout : RowLayoutTokens


class AlignmentCell(NamedTuple):
    """One alignable separator occurrence on a specific row."""

    row_idx: RowIndex
    offset_width: OffsetWidth


# Groups of cells on consecutive rows that should be padded together.
CellGroups = Dict[AlignmentCellKey, List[AlignmentCell]]
def find_alignment_contexts(table: TokenTable) -> Iterator[AlignmentContext]:
    """Yield one ``AlignmentContext`` per row of ``table``.

    For every separator in ``ALIGN_BEFORE_TOKENS`` the context records the
    width of the token just before it, keyed by column, token and the row's
    layout so far — so only structurally identical rows can later be
    grouped together.

    Fixes vs. previous revision: removed ``maybe_indent_token``/``indent``,
    which were computed but never used, and renamed the misspelled local
    ``layou_ttoken_val`` to ``layout_token_val``. No behavior change.
    """
    for row in table:
        ctx: AlignmentContext = {}
        layout: RowLayoutTokens = tuple()
        for col_index, token in enumerate(row):
            layout_token_val: Optional[TokenVal]
            if token.typ in LAYOUT_VAL_TOKENS:
                if token.val in ALIGN_BEFORE_TOKENS:
                    layout_token_val = token.val
                elif col_index > 0:
                    # Layout tokens such as ([{ don't cause alignment to
                    # their preceding token, so the line offset up to the
                    # column of those tokens can be different. We only
                    # want to continue with alignment if the tokens are
                    # all at the same line offset.
                    layout_token_val = token.val + f"::{len(row[col_index - 1].val)}"
                else:
                    layout_token_val = None
            else:
                layout_token_val = None
            layout += (RowLayoutToken(token.typ, layout_token_val),)
            if token.val in ALIGN_BEFORE_TOKENS:
                assert token.typ == TokenType.SEPARATOR
                prev_token = row[col_index - 1]
                # Two separators in a row (e.g. "],") — nothing to pad.
                if prev_token.typ == TokenType.SEPARATOR:
                    continue
                offset_width = len(prev_token.val)
                ctx_key = AlignmentContextKey(col_index, token.typ, token.val, layout)
                ctx[ctx_key] = offset_width
        yield ctx
def find_cell_groups(alignment_contexts: List[AlignmentContext]) -> CellGroups:
    """Group alignment cells that appear on consecutive rows.

    A cell extends an existing group when the previous row produced a cell
    with the same column, token value and layout; otherwise it starts a new
    group. The group key always carries the last row index reached, so a
    gap of even one row closes the group.
    """
    groups: Dict[AlignmentCellKey, List[AlignmentCell]] = {}
    for row_idx, ctx in enumerate(alignment_contexts):
        for key, width in sorted(ctx.items()):
            col_idx, _tok_typ, tok_val, layout = key
            predecessor_key = AlignmentCellKey(row_idx - 1, col_idx, tok_val, layout)
            successor_key = AlignmentCellKey(row_idx, col_idx, tok_val, layout)
            cell = AlignmentCell(row_idx, width)
            carried = groups.pop(predecessor_key, None)
            if carried is None:
                groups[successor_key] = [cell]
            else:
                groups[successor_key] = carried + [cell]
    return groups
def realigned_contents(table: TokenTable, cell_groups: CellGroups) -> str:
    """Pad tokens in ``table`` per ``cell_groups`` and render back to text.

    Only groups spanning at least 3 rows are aligned. Within a group the
    token before each separator is padded to the widest such token: numeric
    literals are padded on the left (right-aligned), everything else on the
    right (left-aligned). A separator that is the last token of its row is
    left alone.

    NOTE: mutates the rows of ``table`` in place while rendering.

    Fix vs. previous revision: removed ``prev_col_index``, which was
    assigned on every iteration but never read. No behavior change.
    """
    for ctx_key, cells in sorted(cell_groups.items()):
        # Aligning only two rows adds churn for little readability gain.
        if len(cells) < 3:
            continue
        max_offset_width = max(ow for _, ow in cells)
        for row_index, offset_width in cells:
            extra_offset = max_offset_width - offset_width
            if extra_offset == 0:
                continue
            row = table[row_index]
            left_token = row[ctx_key.col_index - 1]
            # Underscore digit separators are ignored when deciding
            # whether the token is a number.
            maybe_number = left_token.val.strip().replace("_", "")
            if maybe_number.isdigit():
                padded_left_token_val = " " * extra_offset + left_token.val
            elif row[ctx_key.col_index + 1].typ == TokenType.NEWLINE:
                # don't align if this is the last token of the row
                continue
            else:
                padded_left_token_val = left_token.val + " " * extra_offset
            row[ctx_key.col_index - 1] = Token(TokenType.CODE, padded_left_token_val)
    return "".join("".join(token.val for token in row) for row in table)
def align_formatted_str(src_contents: str) -> FileContent:
    """Re-align separators of already-formatted source text.

    Pipeline: tokenize -> split the token stream into rows at newlines ->
    collect per-row alignment contexts -> group compatible cells across
    consecutive rows -> pad and re-render.
    """
    # Set to 1 to dump tokens, rows and cell groups while developing.
    debug = 0
    table: TokenTable = [[]]
    for token in tokenize_for_alignment(src_contents):
        if debug:
            print("TOKEN: ", repr(token.val).ljust(50), token)
        table[-1].append(token)
        if token.typ == TokenType.NEWLINE:
            table.append([])
        else:
            # Only BLOCK tokens (strings/comments) may span lines.
            assert token.typ == TokenType.BLOCK or "\n" not in token.val
    if debug:
        for row in table:
            print("ROW: ", end="")
            for tok_cell in row:
                print(tok_cell, end="\n ")
            print()
    alignment_contexts = list(find_alignment_contexts(table))
    cell_groups = find_cell_groups(alignment_contexts)
    if debug:
        for cell_key, cells in cell_groups.items():
            if len(cells) > 1:
                print("CELL", len(cells), cell_key)
                for all_cell in cells:
                    print("\t\t", all_cell)
    return realigned_contents(table, cell_groups)
- print(align_formatted_str(str_contents))
Add Comment
Please, Sign In to add comment