Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Rules:
- Cycle may contain any number of independent:
- * Virtual (takes no time but affects dependecy graph)
- alloca
- * Loads. So can't group any of:
- %arrayidx = getelementptr inbounds i8*, i8** %argv, i32 %i
- %0 = load i8*, i8** %arrayidx
- %1 = load i8, i8* %0
- * Compute.
- binops: .add, .sub, .mul, .div, .rem, *sh*, and, or, xor
- other: getelementptr
- cross-phase: select (implies pick), br-cmp (implies compute and branch)
- Example Group:
- %conv26.i = sext i8 %9 to i32
- %cmp27.i = icmp sgt i8 %9, 57
- But not:
- %sub31.sub.v.i = select i1 %cmp27.i
- * Conversion
- ie: *trunc, *ext, *to*, *cast,
- * Pick
- vector: [insert|extractelement], shufflevector
- * Store:
- store i64 %4, i64* %5
- store %struct.list_head_s* %incdec.ptr, %struct.list_head_s** %next
- * Branch
- terminators: ret, br, switch
- (indirectbr, invoke, resume, catchswitch, catchret, cleanupret, unreachable - not present in benchmark)
- other: call,
- Store may depend on Compute and Load. Compute may depend on Load. Due to phased
- execution. Also, Branch may depend on anything.
- There's some ways Compute can depend on Compute, like fused-multiply-add. But
- it's complitated, needs reordering. Never-mind for now.
- [exract|insert]value aren't in benchmark, luckily; No idea how to handle
- alloca barely appears.
- fence does not appear, and Mill is fence-free besides
- cmpxchg, atomic* does not appear - would be complicated
- phi - DNA
- call malloc, putchar, etc - few in benchmark, treat as branch
- va_arg,
- ignore multicycle effects (add|div) - assume magic scheduler
- TODO: histogram instruction categories
- loop unrolling?
- reorder, hoist?
- work backwords to spec #pipes, FUs, etc
- PGO the code to remove unmodeled branch-prediction effects
- Handle switch fairly for Mill and Intel -- one-line switch reduces input program by 10%
- Features:
- * AIR - Apparently Infinite Registers
- * PEX - Phased Execution
- * EFU - Enough Functional Units
- * EDW - Enough Decode bandWidth
- Notes:
- * If no instructions fuse, IPC=1 by definition
- * Switches are single-cycle -- EFU
- Algorithm:
- * Load instruction list
- * Reverse the list
- * For each instruction, get operation, phase, operands, result
- * If any operand appears in the result-list for this phase or a subsequent phase emit the old and start a new fused instruction
- * Otherwise, add this instruction to the old fused instruction
- Output is a reversed list of fused instructions
- IPC = InstructionsIn / FusedInstructionsOut
- Fans guess Mill will have IPC>10. Skepics guess IPC>2 is hard; IPC>3 is
- impossible. Hoping to dissapoint everyone.
- """
- from sys import stdin
- from re import *
- from os import abort
- match_instr = compile(r"\s*(%\S+)?\s*=?\s*(\S+)\s*(.*)").match
- findall_operands = compile(r"(%[^, \(\)\[\]]+)").findall
- # PHASES
- LOAD = 2
- COMPUTE = 3
- CONVERT = 4
- PICK = 5
- STORE = 6
- BRANCH = 7
- # FINAL phase for sizing tables, not a real phase
- FINAL = 7
- # Alloca, not a real instruction
- VIRTUAL = 1
- def M(op):
- return compile(op).match
- # TODO: special case branch: goes in COMPUTE phase if conditional, BRANCH phase if unconditional
- op_phase = { M('alloca'): VIRTUAL,
- M('load'): LOAD,
- M('store'): STORE,
- M('insertelement'): STORE,
- M('.*(add|sub|mul|div|rem|sh|and|or|xor|cmp).*'): COMPUTE,
- # switch is COMPUTE because its a mix of COMPUTE and BRANCH and COMPUTE is earlier than BRANCH
- M('switch'): COMPUTE,
- M('getelementptr'): COMPUTE,
- M('select'): COMPUTE,
- M('.*(trunc|ext|to|cast).*'): CONVERT,
- M('br'): BRANCH,
- M('call|ret|tail'): BRANCH
- }
- def which_phase(op, operands):
- """returns phase for op
- Examines operands for branch; goes in COMPUTE phase when
- conditional"""
- if op == 'br':
- if(len(operands) > 1):
- return COMPUTE
- for matches, phs in op_phase.iteritems():
- if matches(op):
- return phs
- # XXX raise
- print "ERROR: which_phase", op, operands
- exit(1)
- def parse(inst):
- """returns (op, phase, operands, result)
- result may be None
- """
- res = match_instr(inst)
- if res is not None:
- (result, op, tail) = res.groups()
- else:
- print "parse('%s') failed" % inst
- exit(1)
- operands = findall_operands(tail)
- phase = which_phase(op, operands)
- return (op, phase, operands, result)
- split_instructions = compile(r"\s*About to interpret:\s+",
- MULTILINE | DOTALL).split
- def join_lines(ls):
- return ls.replace("\n", "")
- match_blank = compile(r"^\s*$").match
- def not_blank(s):
- return match_blank(s) is None
- def conflicts(phops, phase, result):
- """return True when the result is an operand of this or any previous
- phase"""
- for phs in range(phase + 1):
- if result in phops[phs]:
- # print "CONFLICT: ", result, "used in phase", phs, "supplied in phase", phase
- return True
- return False
- def empty_phased_operands():
- r = []
- for i in range(FINAL + 1):
- r.append(set())
- return r
- # Read all input, split by instruction and reverse
- # (Most instructions are 1-per line, but switch is multi-line, damnit)
- def go():
- insts = reversed(
- filter(not_blank,
- map(join_lines,
- split_instructions(stdin.read()))))
- fused = []
- # phased_operands[phase] = [operands]
- phased_operands = empty_phased_operands()
- for inst in insts:
- # print inst
- op, phase, operands, result = parse(inst)
- if conflicts(phased_operands, phase, result):
- print len(fused), result, fused
- fused = []
- phased_operands = empty_phased_operands()
- fused.append(inst)
- phased_operands[phase] |= set(operands)
- # print "PHASE", phase, "OPNDS", phased_operands
- # Last instruction out
- if fused:
- print fused
- go()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement