# Reverse engineering Penultimate

#!/usr/bin/env python3

"""
For more details see: http://digitalinvestigation.wordpress.com/2012/12/05/reverse-engineering-evernote-penultimate-or-when-is-a-picture-not-a-picture/
"""

"""
Copyright (c) 2012, CCL Forensics
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the CCL Forensics nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL CCL FORENSICS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import sys
import struct
import xml.etree.ElementTree as etree
import os
import datetime

# ccl_bplist can be obtained from http://code.google.com/p/ccl-bplist/
import ccl_bplist

# Module metadata.
__version__ = "1.1"
__description__ = "Parses the documents in the Private Documents folder of PenUltimate"
__contact__ = "Alex Caithness"
# NOTE(review): __outputtype__ and __outputext__ are not read by any code visible
# in this file; presumably they are consumed by an external host tool - confirm.
__outputtype__ = 1
__outputext__ = None

# Name of the sub-directory (inside the output directory) that receives the
# per-page SVG images.
IMAGE_DIR = "img"
# strftime() pattern (day/month/year, 24-hour clock) used for the created and
# modified timestamps written to the report.
DATETIME_FORMAT = "%d/%m/%Y %H:%M:%S"

# Reference instant that decode_cocoa_time() counts from: midnight on
# 1 January 2001 (naive datetime, no tzinfo attached).
EPOCH = datetime.datetime(year=2001, month=1, day=1)


def decode_cocoa_time(f):
    """Return the datetime that lies *f* seconds after EPOCH.

    *f* may be an int or a float (fractional seconds are kept); negative
    values give instants before EPOCH.  The result is a naive datetime.
    """
    elapsed = datetime.timedelta(seconds=f)
    return EPOCH + elapsed

def unpack_float_data(data):
    """Decode a buffer of packed little-endian 32-bit floats into triples.

    data: bytes-like object whose length is a multiple of 12 (three 4-byte
          floats per record).

    Returns a tuple of 3-tuples of floats, in the order they appear in
    *data*; an empty buffer yields an empty tuple.

    Raises ValueError if the length is not a whole number of floats, or if
    the number of floats is not a multiple of three.
    """
    if len(data) % 4 != 0:
        raise ValueError("floatData is not divisible by 4 (cannot contain a number of full 32-bit floats)")
    float_count = len(data) // 4
    if float_count % 3 != 0:
        raise ValueError("floatData does not contain a multiple of 3 floats")
    # "<3f" is one record: three little-endian floats. iter_unpack walks the
    # buffer record by record, yielding one 3-tuple each time.
    return tuple(struct.iter_unpack("<3f", data))

"""
work_input: tuple containing the path for the input folder (Private Documents)
work_output: tuple containing the folder for the output
"""
def _load_keyed_archive(path):
    """Read the binary plist at *path* and return it deserialised via ccl_bplist."""
    with open(path, "rb") as f:
        return ccl_bplist.deserialise_NsKeyedArchiver(ccl_bplist.load(f))


def _build_page_svg(document_obj):
    """Build an SVG element from one deserialised page document.

    Every layer in document_obj["pageLayers"]["NS.objects"] is drawn in that
    layer's colour; each (x, y, d) triple decoded from the layer's rectangles
    becomes one filled <circle>.  The element's width, height and viewBox are
    set to the largest layer width and height seen (0 if there are no layers).
    """
    report_svg = etree.Element("svg",
                               {"xmlns": "http://www.w3.org/2000/svg",
                                "version": "1.1"})

    image_width = 0
    image_height = 0
    for layer_index, layer_obj in enumerate(document_obj["pageLayers"]["NS.objects"]):
        print("\t\tLayer {0}".format(layer_index))

        # "size" is a string of two comma-separated integers wrapped in braces
        layer_width, layer_height = (int(s) for s in layer_obj["size"].strip("{}").split(","))

        # The image must be big enough for the largest layer
        image_width = max(image_width, layer_width)
        image_height = max(image_height, layer_height)

        # Colour components are scaled by 0xFF and truncated to build #rrggbb
        color = layer_obj["color"]
        red = int(color["red"] * 0xFF)
        green = int(color["green"] * 0xFF)
        blue = int(color["blue"] * 0xFF)
        layer_color_code = "#{0:02x}{1:02x}{2:02x}".format(red, green, blue)

        for layer_rect in layer_obj["layerRects"]["NS.objects"]:
            # floatData is an array of floats grouped in threes
            co_ords = unpack_float_data(layer_rect["values"]["floatData"])

            for x, y, d in co_ords:
                # y is mirrored against the layer height (presumably the
                # source data has its origin at the bottom - confirm); the
                # third value is divided by 4 to give the circle radius.
                etree.SubElement(report_svg, "circle",
                                 {"cx": str(x),
                                  "cy": str(layer_height - y),
                                  "r": str(d / 4),
                                  "fill": layer_color_code})

    report_svg.set("viewBox", "0 0 {0} {1}".format(image_width, image_height))
    report_svg.set("height", str(image_height))
    report_svg.set("width", str(image_width))
    return report_svg


def __dowork__(work_input, work_output):
    """Parse every notebook under the input folder and write a report plus images.

    work_input:  tuple whose first element is the path of the input folder;
                 it must contain a "notebookList" file and a "notebooks"
                 directory holding <book id>/<page id> files.
    work_output: tuple whose first element is the path of an existing output
                 folder.

    Writes "report.tsv" (one row per notebook) into the output folder and one
    "<book id>_<page id>.svg" per page into its IMAGE_DIR sub-directory, which
    is created if it does not exist yet.

    Raises ValueError if either argument is not a non-empty tuple.  Exits the
    process with status 1 (SystemExit) if the input or output path is missing.
    """
    # Unpack input
    if not isinstance(work_input, tuple) or len(work_input) < 1:
        raise ValueError("work_input must be a tuple containing the path for the input database")

    if not isinstance(work_output, tuple) or len(work_output) < 1:
        raise ValueError("work_output must be a tuple containing the path for the output")

    input_path = work_input[0]
    output_path = work_output[0]

    # sys.exit(1) rather than the site-provided exit(): always available, and
    # reports the failure through a non-zero exit status.
    if not os.path.exists(input_path):
        print("ERROR: Cannot find input directory.")
        sys.exit(1)
    if not os.path.exists(output_path):
        print("ERROR: Cannot find output directory.")
        sys.exit(1)

    # set up output dirs (exist_ok so the tool can be re-run into the same folder)
    image_dir_path = os.path.join(output_path, IMAGE_DIR)
    os.makedirs(image_dir_path, exist_ok=True)

    # set up main report; the with-block closes it even if parsing fails part-way
    with open(os.path.join(output_path, "report.tsv"), "wt", encoding="utf-8") as out_tsv:
        out_tsv.write("\t".join(["book_id", "title", "created_timestamp", "modified_timestamp", "page_count", "change_count", "creating_device_id"]) + "\n")

        # Open the notebookList file and deserialize
        notebook_list_obj = _load_keyed_archive(os.path.join(input_path, "notebookList"))

        # Get list of notebooks
        for notebook_details in notebook_list_obj["notebooks"]["NS.objects"]:

            title = notebook_details["title"]
            book_id = notebook_details["name"]
            if isinstance(book_id, ccl_bplist.NsKeyedArchiverDictionary):
                # Sometimes book_id is stored as an "NS.string" rather than straight in a string
                book_id = book_id["NS.string"]
            change_count = str(notebook_details["changeCount"])
            page_names = notebook_details["pageNames"]["NS.objects"]
            page_count = len(page_names)
            created_timestamp = decode_cocoa_time(notebook_details["created"]["NS.time"]).strftime(DATETIME_FORMAT)
            modified_timestamp = decode_cocoa_time(notebook_details["modified"]["NS.time"]).strftime(DATETIME_FORMAT)
            creating_device_id = notebook_details["creatingDeviceId"]

            print("Working on {0} ({1})".format(book_id, title))

            # Render each page of this notebook to its own SVG file
            for page_details in page_names:
                page_id = page_details["NS.string"]
                print("\tPage ID: {0}".format(page_id))

                page_path = os.path.join(input_path, "notebooks", book_id, page_id)
                report_svg = _build_page_svg(_load_keyed_archive(page_path))

                # write out image
                image_output_filename = "{0}_{1}.svg".format(book_id, page_id)
                image_output_path = os.path.join(image_dir_path, image_output_filename)
                print("\t\tWriting to '{0}'".format(image_output_path))
                with open(image_output_path, "wt", encoding="utf-8") as svg_out:
                    svg_out.write("<!DOCTYPE svg>")
                    svg_out.write(etree.tostring(report_svg, "utf-8", method="html").decode("utf-8"))
                print("\t\tImage written")

            out_tsv.write("\t".join([book_id, title, created_timestamp, modified_timestamp, str(page_count), change_count, creating_device_id]) + "\n")

    print("Done!")

def __main__():
    """Command-line entry point.

    Expects two positional arguments: the input folder and the output folder.
    With both present they are each wrapped in a one-element tuple and passed
    to __dowork__; otherwise a usage message is printed and the process exits
    with status 1.
    """
    argv = sys.argv
    if len(argv) >= 3:
        source_dir, destination_dir = argv[1], argv[2]
        __dowork__((source_dir,), (destination_dir,))
        return

    # Too few arguments: show usage, framed by blank lines, and fail.
    print()
    script_name = os.path.basename(argv[0])
    print("Usage: {0} <Private Documents dir> <output dir>".format(script_name))
    print()
    sys.exit(1)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    __main__()