# program2.py


from __future__ import print_function

import base64
import datetime
import sys

# Reverse Caesar shift the alphabetic characters in 'line' by 'amt' places.
def outer_Caesar_shift(line, amt):
    """Return 'line' with every ASCII letter rotated back by 'amt' places.

    Upper-case letters wrap around within A-Z and lower-case letters within
    a-z; every other character is copied through unchanged.
    """
    upper_a, upper_z = ord('A'), ord('Z')
    lower_a, lower_z = ord('a'), ord('z')

    def unshift(code, base):
        # Rotate inside the 26-letter alphabet that starts at code point 'base'.
        return chr((code - base - amt) % 26 + base)

    pieces = []
    for ch in line:
        code = ord(ch)
        if upper_a <= code <= upper_z:
            pieces.append(unshift(code, upper_a))
        elif lower_a <= code <= lower_z:
            pieces.append(unshift(code, lower_a))
        else:
            pieces.append(ch)
    return "".join(pieces)

# Reverse the bits in a byte.
def reverse_byte(b):
    """Return the integer whose binary digits are those of 'b' read backwards.

    'b' is first rendered as binary text zero-padded to at least eight
    digits, so a value in 0..255 maps to its bit-mirrored byte.
    """
    bits = format(b, "08b")
    mirrored = "".join(reversed(bits))
    return int(mirrored, 2)

# Convert the external coded message text to a list of correctly flipped bytes.
# This means that for New messages, the even-numbered bytes are reversed.
# Arguments:
#    'clue' is "O" for Old messages, "N" for New messages,
#           "+" for New messages that need to be extended by a zero byte,
#           and "?" for unknown message type, i.e., the sidebar.
#    'stamplast' is last character of the Unix timestamp, to use for
#                the reverse Caesar shift.
#    'stuff' is the external coded message text.
def extract_ordered_inside_bytes(clue, stamplast, stuff):
    """Decode the external message text into a list of byte values.

    Steps: undo the Caesar shift (by the digit value of 'stamplast'),
    pad the text to a multiple of four characters and base64-decode it,
    read the result as an ASCII decimal integer, and split that integer's
    binary form into 8-bit groups (left-padded with zero bits).

    For clue "+" an extra zero byte is prepended.  For clues "N" and "+"
    every second byte, counting from the start of the list (the 2nd, 4th,
    ...), has its bit order reversed.

    Returns the list of integers, most significant byte first.
    """
    shift_amount = ord(stamplast) - ord('0')
    encoded = outer_Caesar_shift(stuff, shift_amount)
    # Top up with '=' so the length is a multiple of four (adds nothing
    # when it already is).
    encoded += "=" * ((-len(encoded)) % 4)

    digits = base64.b64decode(encoded).decode("ascii")
    bit_text = "{:b}".format(int(digits))
    # Left-pad with zero bits up to a whole number of bytes.
    bit_text = "0" * ((-len(bit_text)) % 8) + bit_text
    if clue == "+":
        bit_text = "00000000" + bit_text

    flip_alternate = clue in ("N", "+")
    result = []
    for position, start in enumerate(range(0, len(bit_text), 8)):
        chunk = bit_text[start:start + 8]
        if flip_alternate and (position % 2) == 1:
            chunk = chunk[::-1]
        result.append(int(chunk, 2))
    return result

# Print out a byte list of the message in the standard eight columns.
# Note: I number my columns like I number my bytes: Right to left, and starting from 0.
def print_byte_array(byte_list):
    """Print 'byte_list' as rows of eight 8-bit columns.

    Columns are numbered 7 down to 0 from left to right.  The list is
    left-padded with blank cells so that its final byte lands in column 0,
    which means a short first row starts with empty space.  Each byte is
    shown as eight binary digits with every "0" drawn as "-".

    'byte_list' must be a list of integers; nothing is returned.
    """
    # Number of spaces printed before each column, indexed by column number.
    gap_before_column = [1, 2, 1, 3, 1, 2, 1, 4]
    leading_blanks = [None] * ((-len(byte_list)) % 8)
    column = 8
    for current_byte in leading_blanks + byte_list:
        column = (column - 1) % 8
        print(" " * gap_before_column[column], end="")
        # Identity test: None marks a padding cell, never a real byte.
        if current_byte is None:
            print(" " * 8, end="")
        else:
            print("{:08b}".format(current_byte).replace("0", "-"), end="")
        if column == 0:
            print()

# Print out the column headers corresponding to printing out a byte
# list where the bytes have already been flipped into 70615243 order.
def print_flipped_array_header(clue, length_byte_list):
    """Print the two header lines that go above a printed byte array.

    The first line labels each column FORWARD or REVERSED: for clues "N"
    and "+" the labels alternate, and which one comes first depends on
    whether 'length_byte_list' is even or odd; for any other clue every
    column is FORWARD.  The second line is the fixed "70615243" legend.
    """
    if clue not in ("N", "+"):
        orientation = "    FORWARD  FORWARD   FORWARD  FORWARD    FORWARD  FORWARD   FORWARD  FORWARD "
    elif length_byte_list % 2:
        orientation = "    REVERSED FORWARD   REVERSED FORWARD    REVERSED FORWARD   REVERSED FORWARD "
    else:
        orientation = "    FORWARD  REVERSED  FORWARD  REVERSED   FORWARD  REVERSED  FORWARD  REVERSED"
    print(orientation)
    print("    70615243 70615243  70615243 70615243   70615243 70615243  70615243 70615243")

# Print out the XOR mask template corresponding to the message.
def print_flipped_array_footer(clue, length_byte_list):
    """Print the one-line XOR mask template that goes below a byte array.

    Clues "N" and "+" get one letter layout, clue "O" gets another, and
    any other clue prints nothing.  'length_byte_list' is accepted for
    symmetry with the header printer but is not used.
    """
    new_style = clue in ("N", "+")
    old_style = (not new_style) and clue in ("O",)
    if not (new_style or old_style):
        return
    if new_style:
        template = "    ijklmnop abcdefgh  IJKLMNOP ABCDEFGH   ABCDEFGH IJKLMNOP  abcdefgh ijklmnop"
    else:
        template = "    ABCDEFGH IJKLMNOP  abcdefgh ijklmnop   ABCDEFGH IJKLMNOP  abcdefgh ijklmnop"
    print(template)

# Print out the XOR mask bit values corresponding to the message.
def print_unmasked_array_header(clue, mask):
    """Print the UNMASKED title line and, below it, the mask bytes per column.

    'mask' supplies four byte values.  For clues "N" and "+" they are laid
    out as 0,1,2,3,3,2,1,0 across the eight columns; for clue "O" as
    3,2,1,0,3,2,1,0.  Any other clue prints only the title line.
    """
    print("    UNMASKED UNMASKED  UNMASKED UNMASKED   UNMASKED UNMASKED  UNMASKED UNMASKED")
    if clue in ("N", "+"):
        layout = "    {0:08b} {1:08b}  {2:08b} {3:08b}   {3:08b} {2:08b}  {1:08b} {0:08b}"
    elif clue in ("O",):
        layout = "    {3:08b} {2:08b}  {1:08b} {0:08b}   {3:08b} {2:08b}  {1:08b} {0:08b}"
    else:
        layout = None
    if layout is not None:
        print(layout.format(*mask))

# Apply a given mask against the byte list.
def apply_mask(clue, mask, byte_list):
    """Return a new list with each byte XORed against its column's mask byte.

    Bytes are assigned to columns 7..0 repeating, arranged so the final
    byte sits in column 0.  For clues "N" and "+" the mask index is
    mirrored around the middle (columns 7 and 0 share mask[0], 6 and 1
    share mask[1], ...); for any other clue it is the column modulo 4.
    """
    mirrored = clue in ("N", "+")
    total = len(byte_list)
    unmasked = []
    for position, value in enumerate(byte_list):
        # Same column the countdown from len % 8 would reach at this byte.
        column = (total - 1 - position) % 8
        if mirrored:
            mask_index = min(column, 7 - column)
        else:
            mask_index = column % 4
        unmasked.append(value ^ mask[mask_index])
    return unmasked

# Print out a byte by byte detailed listing of the decoding.
# If 'only' is specified, then just list the bytes for that mask value.
def print_byte_by_byte_detail(clue, mask, byte_list, only=None):
    """Print one detail row per byte showing each stage of the decoding.

    Each row shows: the byte with the alternate-byte bit flip undone
    (ORIGINAL), its column number followed by "-" if it was flipped or
    "+" if not, the byte as stored in 'byte_list', the mask column and
    mask byte, the XOR result, the bit-permuted result, its hex value,
    and a character rendering.

    Arguments:
       'clue' selects the layout: "N" or "+" mirrors the mask columns and
              treats every second byte (2nd, 4th, ...) as flipped.
       'mask' holds the four mask byte values.
       'byte_list' is the list of message bytes.
       'only', when not None, restricts the rows to that one mask column
               and suppresses the blank line between groups of eight.
    """
    print("    ORIGINAL  COL 70615243  MC MASKBYTE UNMASKED  PERMUTED CHARACTER")

    flipped_layout = clue in ("N", "+")
    column = len(byte_list) % 8
    reverse_flag = True
    for current_byte in byte_list:
        # Toggled before use, so the first byte is treated as not reversed.
        reverse_flag = not reverse_flag
        column = (column - 1) % 8
        if flipped_layout:
            mask_column = min(column, 7 - column)
        else:
            mask_column = column % 4

        if reverse_flag and flipped_layout:
            original_byte = reverse_byte(current_byte)
            reverse_symbol = "-"
        else:
            original_byte = current_byte
            reverse_symbol = "+"

        mask_value = mask[mask_column]
        unmasked_byte = mask_value ^ current_byte

        # Reorder the bits (string positions, 0 = most significant):
        # the even positions ascending, then the odd positions descending.
        ustr = "{:08b}".format(unmasked_byte)
        pstr = ustr[0] + ustr[2] + ustr[4] + ustr[6] + ustr[7] + ustr[5] + ustr[3] + ustr[1]
        permuted_byte = int(pstr, 2)

        if permuted_byte == 0x00:
            ch = "NUL"
        elif permuted_byte < 0x20:
            ch = "..."
        elif permuted_byte < 0x7F:
            ch = chr(permuted_byte)
        elif permuted_byte == 0x7F:
            ch = "DEL"
        else:
            ch = "***BAD***"

        # A blank line separates each group of eight in the unfiltered listing.
        if (only is None) and (column == 7):
            print()

        if (only is None) or (only == mask_column):
            print("    {0}  {1}{2}  {3}  {4}  {5} {6}  {7} 0x{8:02X}  {9}".format(
                  "{:08b}".format(original_byte).replace("0", "-"),
                  column,
                  reverse_symbol,
                  "{:08b}".format(current_byte).replace("0", "-"),
                  mask_column,
                  "{:08b}".format(mask_value),
                  "{:08b}".format(unmasked_byte).replace("0", "-"),
                  "{:08b}".format(permuted_byte).replace("0", "-"),
                  permuted_byte,
                  ch
            ))

# Perform a special bit by bit analysis of the 0 timestamp message.
def zero_statistical_analysis(clue, byte_list):
    """Print, for each bit position 0..7, the fraction of bytes with it set.

    Bit 0 is the least significant bit.  The report is framed by BEGIN
    and END banner lines.  'clue' is accepted but not used.  An empty
    'byte_list' reports 0.0 for every bit instead of dividing by zero.
    """
    total = len(byte_list)
    bit_bias = [0] * 8
    for current_byte in byte_list:
        for i in range(8):
            if current_byte & (1 << i):
                bit_bias[i] += 1
    print()
    print("-------------------- BEGIN SPECIAL 0 TIMESTAMP MESSAGE ANALYSIS --------------------")
    for i in range(8):
        # float() forces true division; on Python 2 int / int would truncate
        # every fraction to 0 or 1.
        fraction = (float(bit_bias[i]) / total) if total else 0.0
        print("    Bit", i, "bias: ", fraction)
    print("-------------------- END SPECIAL 0 TIMESTAMP MESSAGE ANALYSIS ----------------------")
    return

# Guess the XOR mask using a quick and dirty bit frequency analysis.
def quick_statistical_analysis(clue, byte_list):
    """Guess the four mask bytes from per-column bit frequencies.

    Bytes are grouped by mask column using the same column rule as the
    masking step (mirrored for clues "N" and "+", column modulo 4
    otherwise).  For each group and each bit position (bit 0 = least
    significant), the bit counts as "mostly set" when it is set in
    strictly more than half of the group's bytes.  A mask bit is turned
    on wherever that majority value differs from the fixed target
    pattern.

    Returns a list of four integers.
    """
    mirrored = clue in ("N", "+")
    total = len(byte_list)
    group_size = [0] * 4
    ones = [[0] * 8 for _unused in range(4)]

    for position, value in enumerate(byte_list):
        # Same column the countdown from len % 8 would reach at this byte.
        column = (total - 1 - position) % 8
        if mirrored:
            group = min(column, 7 - column)
        else:
            group = column % 4
        group_size[group] += 1
        for bit in range(8):
            if value & (1 << bit):
                ones[group][bit] += 1

    # Expected majority value for each bit position, indexed from bit 0.
    expected = (0, 0, 0, 1, 0, 1, 1, 0)
    guess = []
    for group in range(4):
        value = 0
        for bit in range(8):
            mostly_set = (2 * ones[group][bit]) > group_size[group]
            if int(mostly_set) != expected[bit]:
                value |= 1 << bit
        guess.append(value)
    return guess

# Process a single coded message.
def process_one(index, clue, stamp, posttime, where, byte_list):
    """Print the complete report for one coded message.

    The report contains: a title line, the decoded timestamp (or a note
    that it is missing when 'stamp' does not start with a digit), the
    posting time with "_" shown as ":", the message type/parity/length
    summary, and the byte array with its header and footer.

    For clue "?" the report stops there.  Otherwise a mask is guessed
    statistically, the unmasked array and the byte-by-byte detail are
    printed, and, when the stamp is numeric, two "GREPME" summary lines
    are printed as well.  The all-zero stamp additionally triggers the
    special bit-bias analysis.
    """
    digits = "0123456789"

    print()
    message_id = "[{:s}] {:s}".format(index, stamp)
    print("[{:s}] {:s} {:s}:".format(index, where, stamp))

    print()
    if stamp[0] not in digits:
        print("    Unix timestamp is missing.")
    else:
        if stamp == "0000000000":
            # The all-zero stamp is given as fixed text rather than converted.
            stamp_human = "1970-01-01 00:00:00"
        else:
            # NOTE(review): fromtimestamp() with no tz gives local time, while
            # the zero case above is the UTC epoch text -- confirm intent.
            moment = datetime.datetime.fromtimestamp(int(stamp))
            stamp_human = moment.strftime('%Y-%m-%d %H:%M:%S')
        print("    Unix timestamp decode:  {:s}".format(stamp_human))
    print("    Reddit posting time:    {:s}".format(posttime.replace("_",":")))

    # Message type: printed in words and abbreviated for the category tag.
    if clue in ("N", "+"):
        type_text, type_tag = "    Message type is NEW.", "NEW"
    elif clue in ("O",):
        type_text, type_tag = "    Message type is OLD.", "OLD"
    else:
        type_text, type_tag = "    Message type is UNKNOWN.", "UNK"
    print(type_text, end="")

    size = len(byte_list)
    if size % 2:
        parity_text, parity_tag = "  Message size is ODD.", " ODD "
    else:
        parity_text, parity_tag = "  Message size is EVEN.", " EVEN"
    print(parity_text, end="")
    print("  Message length is {:d} bytes.".format(size))
    category = type_tag + parity_tag + " {:3d}".format(size)

    print()
    print_flipped_array_header(clue, size)
    print_byte_array(byte_list)
    print_flipped_array_footer(clue, size)

    # Unknown-type messages get no mask analysis.
    if clue == "?":
        print()
        return

    if stamp == "0000000000":
        zero_statistical_analysis(clue, byte_list)

    quick_mask = quick_statistical_analysis(clue, byte_list)
    unmasked_byte_list = apply_mask(clue, quick_mask, byte_list)

    print()
    print()
    print("    Using quick-and-dirty statistical mask guess.")
    print()
    print_unmasked_array_header(clue, quick_mask)
    print_byte_array(unmasked_byte_list)
    print()
    print()
    print_byte_by_byte_detail(clue, quick_mask, byte_list)

    # The summary lines need a numeric stamp.
    if stamp[0] not in digits:
        print()
        return

    stamp_binary = "{:032b}".format(int(stamp))

    # Mask bytes joined from index 3 down to index 0.
    mask_binary = "{:08b}{:08b}{:08b}{:08b}".format(
        quick_mask[3], quick_mask[2], quick_mask[1], quick_mask[0])
    # The leading character of each mask byte's 8-character group.
    mask_7_bits = "{:s} {:s} {:s} {:s}".format(
        mask_binary[0], mask_binary[8], mask_binary[16], mask_binary[24])
    print()
    print("    <<GREPME001>>  ", message_id, category, " ", stamp_binary, "", mask_7_bits)
    print("    <<GREPME002>>  ", message_id, category, " ", stamp_binary, "", mask_binary)

    print()
    return

# Process all the coded messages in the f04cb dataset.
def process_all():
    """Read the f04cb dataset file and report on every message in it.

    Each non-blank line of "dataset_for_solving_f04cb.txt" must hold six
    colon-separated fields: index, clue, stamp, posting time, location
    and the coded message text.  Lines whose clue is "X" are skipped;
    every other line is decoded and passed on for reporting.

    Raises ValueError if a non-blank line does not have exactly six fields.
    """
    # 'with' guarantees the file is closed even if a line fails to decode.
    with open("dataset_for_solving_f04cb.txt", "r") as datafile:
        for oneline in datafile:
            # Drop the line terminator, including a stray carriage return.
            oneline = oneline.rstrip("\r\n")
            if not oneline:
                # A blank line (e.g. at the end of the file) carries no message.
                continue
            index, clue, stamp, posttime, where, stuff = oneline.split(":")
            if clue == "X":
                continue
            byte_list = extract_ordered_inside_bytes(clue, stamp[-1], stuff)
            # Edit the following if-test to only process some of the messages.
            if True:
                process_one(index, clue, stamp, posttime, where, byte_list)


# Run the full dataset decode only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_all()