# TODO (mb 2018-09-10): don't align last element on line if
#   followed by a comment
# TODO (mb 2018-09-09): respect fmt: off, fmt: on
# TODO (mb 2018-09-08): replace in python context r"\[\"\w[\w\d\-]*\"\]"
#   i.e. mydict["mykey"] -> mydict['mykey'] (see the sketch below)
# TODO (mb 2018-09-08): replace in dict context r"\"\w[\w\d\-]*\":"
#   i.e. "mykey": "myval", -> 'mykey': "myval",


import re
from enum import Enum
from typing import Dict, Iterator, List, NamedTuple, Optional, Tuple

FileContent = str


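# NOTE: hypothetical helper sketching the subscript quote replacement
# from the TODOs above; the name and the exact pattern are assumptions
# (it only handles simple keys without escapes).
def _rewrite_subscript_quotes(src: str) -> str:
    # mydict["mykey"] -> mydict['mykey']
    return re.sub(r'\["(\w[\w\d\-]*)"\]', r"['\1']", src)


assert _rewrite_subscript_quotes('mydict["mykey"]') == "mydict['mykey']"

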
str_contents = """
#!/usr/bin/env python3
# fmt: on
# Some license here.
#
# Has many lines. Many, many lines.
# Many, many, many lines.
\"\"\"Module docstring.

Possibly also many, many lines.
\"\"\"


import os.path
import sys

import a
from b.c import X  # some noqa comment


def test_parse():
    \"\"\"Docstring comes first.

    Possibly many lines.
    \"\"\"
    # FIXME: Some comment about why this function is crap but still in production.

    environ = {
        "MYAPP_DB_HOST": "1.2.3.4",
        "MYAPP_DB_PORT": "1234",
        'MYAPP_DB_PASSWORD': "secret",
        'MYAPP_DB_READ_ONLY': "0",
        'MYAPP_DB_DDL': "~/mkdb.sql",
        'MYAPP_DL': 123_123,
        'MYAPP_DL': 123_123_929,
        'MYAPP_DBDL': 12,
    }

    barf = {
        22: 23_222,
        2234: 231_231_231_232,
        1234: 231_231_232,
    }

    dbenv = myenv.parse(DBEnv, environ, prefix="MYAPP_DB_")

    assert dbenv.host == "1.2.3.4"
    assert dbenv.user == "new_user"
    assert dbenv.password == "secret"

    assert dbenv.read_only is False
    assert isinstance(dbenv.ddl, pl.Path)
    assert str(dbenv.ddl).endswith("mkdb.sql")

    assert len(attrnames) == 7


GLOBAL_STATE = {"a": a(1), "b": a(2), "c": a(3)}

'What\\'s the deal "here"?'
'And "here"?'

StrList = List[str]
PathList = List[Path]

class TestEnv(myenv.BaseEnv):
    '''
    notakey: notaval
    notakeyeither: notaval
    '''

    str_val_wo_default: str
    float_val_wo_default: float
    path_val_wo_default: pl.Path
    paths_val_wo_default: List[pl.Path]

    str_val: str = "foo"
    strs_val: StrList = ["foo = bar", "barfoo = baz"]
    float_val: float = 12.34
    path_val: pl.Path = pl.Path("file.txt")
    paths_val: PathList = [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [1, 22, 2243, 4],

        {23: "asdf", 22: "aaaa", 443: "bbbb", 439: "cccc"},
        {1123: "asdf", 22: "k3k3", 443: "jfjf", 439: "k2k2"},
        {1: "asdf", 2332: "asdfk3k3", 42243: "jssdfjf", 4: "k2k2eie"},
    ]
    \"\"\"Docstring for instance attribute spam.\"\"\"
"""

# str_contents = open("blacker.py").read()
# str_contents = open("test.txt").read()


# fmt: off
ALIGN_BEFORE_TOKENS = {
    "<<=", ">>=", "**=", "//=",
    "+=", "-=", "*=", "/=", "%=", "|=", "&=", "@=",
    "==", "!=", "<=", ">=",
    "//", "<<", ">>", "^=", "~=",
    "in", "is", "},", "],", "),",
    "->",
    ",", ":", "=",
    "+", "-", "*", "/",
    "%", "|", "&", "^", "~",
    "!", "<", ">",
    "}", "]", ")",
}
# fmt: on


NO_ALIGN_BLOCK_END_MATCHERS = {
    "'''": re.compile(r"(?<![^\\]\\)'''"),
    '"""': re.compile(r"(?<![^\\]\\)\"\"\""),
    '"'  : re.compile(r"(?<![^\\]\\)\""),
    "'"  : re.compile(r"(?<![^\\]\\)'"),
    "#"  : re.compile(r"$", flags=re.MULTILINE),
}


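# Illustrative sanity check: the negative lookbehind skips escaped
# delimiters, so in r"it\'s over' here" the block ends at the unescaped
# quote (offset 10), not at the escaped one (offset 3).
assert NO_ALIGN_BLOCK_END_MATCHERS["'"].search(r"it\'s over' here").start() == 10

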
ALIGN_TOKEN_SEP_RE = re.compile(
    r"""
    (
        '''
        |\"\"\"
        |\"
        |'
        |\#

        |<<= |>>= |\*\*= |//=
        |\+= |\-= |\*= |/= |%= |\|= |&= |@=
        |== |!= |<= |>=
        |// |<< |>> |\^= |~=
        |(?<!\w)in(?!\w) |(?<!\w)is(?!\w) |\}, |\], |\),
        |\->
        |, |: |=
        |\+ |\- |\* |/
        |% |\| |& |\^ |~
        |! |< |>
        |\{ |\[ |\(
        |\} |\] |\)

        |$
    )
    """,
    flags=re.MULTILINE | re.VERBOSE,
)


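# Illustrative sanity check: search() finds the leftmost separator of a
# line; for a simple assignment that is the "=" at offset 2.
assert ALIGN_TOKEN_SEP_RE.search("x = 1").group(0) == "="
assert ALIGN_TOKEN_SEP_RE.search("x = 1").span() == (2, 3)

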
class TokenType(Enum):

    INDENT = 0
    SEPARATOR = 1
    CODE = 2
    BLOCK = 3
    NEWLINE = 4


TokenVal = str


class Token(NamedTuple):

    typ: TokenType
    val: TokenVal


def tokenize_for_alignment(src_contents: str) -> Iterator[Token]:
    rest = src_contents
    prev_rest = None

    while rest:
        assert rest != prev_rest, "No progress at: " + repr(rest[:40])
        prev_rest = rest

        curr_token_sep = ALIGN_TOKEN_SEP_RE.search(rest)
        assert curr_token_sep is not None
        curr_token_start, curr_token_end = curr_token_sep.span()

        # The newline match has zero width.
        is_newline = curr_token_start == curr_token_end
        if is_newline:
            # adjust for the zero width match
            curr_token_end = curr_token_start + 1

            # Get everything (if anything) up to (and excluding) the newline.
            token_val = rest[:curr_token_start]
            if token_val:
                assert token_val != "\n"
                yield Token(TokenType.CODE, token_val)

            # The newline itself (note that black promises to
            # have normalized CRLF etc. to plain LF).
            token_val = rest[curr_token_start:curr_token_end]
            assert token_val == "\n"
            yield Token(TokenType.NEWLINE, token_val)

            rest = rest[curr_token_end:]
            # Parse any indent at the start of the new line.
            new_rest = rest.lstrip(" \t")
            indent_len = len(rest) - len(new_rest)
            if indent_len > 0:
                indent_token_val = rest[:indent_len]
                yield Token(TokenType.INDENT, indent_token_val)
                rest = new_rest
        elif curr_token_start > 0:
            prev_token_val = rest[:curr_token_start]
            rest = rest[curr_token_start:]
            assert prev_token_val != "\n"
            assert prev_token_val not in ALIGN_BEFORE_TOKENS, repr(prev_token_val)
            yield Token(TokenType.CODE, prev_token_val)
        else:
            token_val = curr_token_sep.group(0)
            if token_val in NO_ALIGN_BLOCK_END_MATCHERS:
                # comment, string or docstring
                block_begin_val = token_val
                assert curr_token_end > 0
                rest = rest[len(block_begin_val) :]
                end_matcher = NO_ALIGN_BLOCK_END_MATCHERS[token_val]
                block_end_match = end_matcher.search(rest)
                assert block_end_match, rest[:40]
                block_end_token = block_end_match.group(0)
                block_end_index = block_end_match.span()[-1]
                assert block_end_index <= len(rest), f"{len(rest)} < {block_end_index}"
                block_rest = rest[:block_end_index]
                block_token_val = block_begin_val + block_rest
                assert block_token_val.endswith(block_end_token)
                yield Token(TokenType.BLOCK, block_token_val)
                rest = rest[block_end_index:]
            else:
                sep_token_val = token_val
                yield Token(TokenType.SEPARATOR, sep_token_val)
                rest = rest[curr_token_end:]

        # NOTE (mb 2018-09-09): The way we tokenize, we always consume
        # all content belonging to strings and comments, so the rest
        # (after consuming all content of a string or comment) should
        # continue to be valid python. This means we can do some basic
        # sanity checks. For example, no valid python token begins with
        # a question mark (though this check was actually introduced
        # because one of the test cases conveniently has a question mark
        # as the first character after an edge case of string parsing).
        assert not rest.startswith("?"), repr(rest)


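# Illustrative sanity check: a simple assignment splits into CODE
# around the separator, followed by the NEWLINE.
assert list(tokenize_for_alignment("x = 1\n")) == [
    Token(TokenType.CODE, "x "),
    Token(TokenType.SEPARATOR, "="),
    Token(TokenType.CODE, " 1"),
    Token(TokenType.NEWLINE, "\n"),
]

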
Indent = str
RowIndex = int
ColIndex = int
OffsetWidth = int

TokenTable = List[List[Token]]


class RowLayoutToken(NamedTuple):
    """Disambiguate between lines with different layout/structure.

    We only want to align lines which have the same structure of
    indent and separators. Any difference in the number of elements
    or type of separators causes alignment to be disabled.
    """

    typ: TokenType
    # val is only set if it should cause a different prefix,
    # e.g. if a separator is a comma vs a period.
    val: Optional[TokenVal]


# Tokens which have values which are relevant to the layout of
# a cell group.
LAYOUT_VAL_TOKENS = {TokenType.SEPARATOR, TokenType.INDENT}

RowLayoutTokens = Tuple[RowLayoutToken, ...]


class AlignmentContextKey(NamedTuple):
    """Does not change between multiple lines that can be aligned."""

    col_idx: ColIndex
    tok_typ: TokenType
    tok_val: TokenVal
    layout : RowLayoutTokens


AlignmentContext = Dict[AlignmentContextKey, OffsetWidth]


class AlignmentCellKey(NamedTuple):
    last_row_index: RowIndex
    col_index     : ColIndex
    token_val     : TokenVal
    layout        : RowLayoutTokens


class AlignmentCell(NamedTuple):
    row_idx: RowIndex
    offset_width: OffsetWidth


CellGroups = Dict[AlignmentCellKey, List[AlignmentCell]]


def find_alignment_contexts(table: TokenTable) -> Iterator[AlignmentContext]:
    for row in table:
        ctx: AlignmentContext = {}
        layout: RowLayoutTokens = tuple()

        for col_index, token in enumerate(row):
            layout_token_val: Optional[TokenVal]
            if token.typ in LAYOUT_VAL_TOKENS:
                if token.val in ALIGN_BEFORE_TOKENS:
                    layout_token_val = token.val
                elif col_index > 0:
                    # Layout tokens such as ([{ don't cause alignment to
                    # their preceding token, so the line offset up to the
                    # column of those tokens can be different. We only
                    # want to continue with alignment if the tokens are
                    # all at the same line offset.
                    layout_token_val = token.val + f"::{len(row[col_index - 1].val)}"
                else:
                    layout_token_val = None
            else:
                layout_token_val = None

            layout += (RowLayoutToken(token.typ, layout_token_val),)

            if token.val in ALIGN_BEFORE_TOKENS:
                assert token.typ == TokenType.SEPARATOR
                maybe_indent_token = row[0]
                if maybe_indent_token.typ == TokenType.INDENT:
                    indent = maybe_indent_token.val
                else:
                    indent = ""
                prev_token = row[col_index - 1]
                if prev_token.typ == TokenType.SEPARATOR:
                    continue

                offset_width = len(prev_token.val)
                ctx_key = AlignmentContextKey(col_index, token.typ, token.val, layout)
                ctx[ctx_key] = offset_width

        yield ctx


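# Illustrative sanity check (the _demo_* names exist only for this
# example): two rows with identical layout each yield a context keyed
# on the '=' separator, valued with the width of the token to its left
# ('a ' -> 2, 'foo ' -> 4).
_demo_table: TokenTable = [[]]
for _tok in tokenize_for_alignment("a = 1\nfoo = 2\n"):
    _demo_table[-1].append(_tok)
    if _tok.typ == TokenType.NEWLINE:
        _demo_table.append([])

_demo_contexts = list(find_alignment_contexts(_demo_table))
assert list(_demo_contexts[0].values()) == [2]
assert list(_demo_contexts[1].values()) == [4]

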
def find_cell_groups(alignment_contexts: List[AlignmentContext]) -> CellGroups:
    cell_groups: Dict[AlignmentCellKey, List[AlignmentCell]] = {}
    for row_index, ctx in enumerate(alignment_contexts):
        ctx_items = sorted(ctx.items())
        for ctx_key, offset_width in ctx_items:
            col_index, token_typ, token_val, layout = ctx_key
            prev_cell_key = AlignmentCellKey(
                row_index - 1, col_index, token_val, layout
            )
            curr_cell_key = AlignmentCellKey(row_index, col_index, token_val, layout)

            curr_cell = AlignmentCell(row_index, offset_width)

            if prev_cell_key in cell_groups:
                # Extend the group that ended on the previous row.
                prev_cells = cell_groups[prev_cell_key]
                del cell_groups[prev_cell_key]
                cell_groups[curr_cell_key] = prev_cells + [curr_cell]
            else:
                cell_groups[curr_cell_key] = [curr_cell]

    return cell_groups


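# Illustrative sanity check, continuing the _demo_* example above: the
# matching contexts of consecutive rows collapse into one cell group
# (still too small to be realigned, which needs three rows).
_demo_groups = find_cell_groups(_demo_contexts)
assert len(_demo_groups) == 1
assert [_c.offset_width for _c in next(iter(_demo_groups.values()))] == [2, 4]

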
def realigned_contents(table: TokenTable, cell_groups: CellGroups) -> str:
    for ctx_key, cells in sorted(cell_groups.items()):
        # Alignment only kicks in once a group spans at least three rows.
        if len(cells) < 3:
            continue

        max_offset_width = max(ow for _, ow in cells)
        for row_index, offset_width in cells:
            extra_offset = max_offset_width - offset_width
            if extra_offset == 0:
                continue

            row = table[row_index]
            left_token = row[ctx_key.col_index - 1]
            maybe_number = left_token.val.strip().replace("_", "")
            if maybe_number.isdigit():
                # Numbers are right aligned, so pad them on the left.
                padded_left_token_val = " " * extra_offset + left_token.val
            elif row[ctx_key.col_index + 1].typ == TokenType.NEWLINE:
                # Don't align if this is the last token of the row.
                continue
            else:
                padded_left_token_val = left_token.val + " " * extra_offset
            padded_token = Token(TokenType.CODE, padded_left_token_val)
            row[ctx_key.col_index - 1] = padded_token

    return "".join("".join(token.val for token in row) for row in table)


def align_formatted_str(src_contents: str) -> FileContent:
    debug = 0

    table: TokenTable = [[]]
    for token in tokenize_for_alignment(src_contents):
        if debug:
            print("TOKEN: ", repr(token.val).ljust(50), token)
        table[-1].append(token)
        if token.typ == TokenType.NEWLINE:
            table.append([])
        else:
            assert token.typ == TokenType.BLOCK or "\n" not in token.val

    if debug:
        for row in table:
            print("ROW: ", end="")
            for tok_cell in row:
                print(tok_cell, end="\n ")
            print()

    alignment_contexts = list(find_alignment_contexts(table))
    cell_groups = find_cell_groups(alignment_contexts)

    if debug:
        for cell_key, cells in cell_groups.items():
            if len(cells) > 1:
                print("CELL", len(cells), cell_key)
                for all_cell in cells:
                    print("\t\t", all_cell)

    return realigned_contents(table, cell_groups)


print(align_formatted_str(str_contents))
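

# Illustrative end-to-end checks: separators align once a group spans at
# least three rows, and tokens that look like numbers are right aligned.
assert align_formatted_str("a = 1\nfoo = 2\nab = 3\n") == "a   = 1\nfoo = 2\nab  = 3\n"
assert align_formatted_str("2: x\n23: y\n234: z\n") == "  2: x\n 23: y\n234: z\n"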