// Untitled

// -----------------------------------------

#include "mupdf/pdf.h"

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>

// Number of slots in pathHashTable; hash() reduces every path to an index
// below this value.
// The value needs to be sufficiently large to keep accidental hash matches
// rare: record_path() treats an already-occupied slot as a conflict and does
// not count the path, even when it differs from the path that claimed the slot.
// The default, (1024 * 1024), gives a table of one byte per slot (1 MiB) and
// should be big enough for most cases.
// Given a large enough PDF this could still cause issues; how large is
// currently unknown.
#define PATH_HASH_TABLE_SIZE ( 1024 * 1024)

// -----------------------------------------

// MuPDF state shared by the functions in this file.
// All three are set up, and dropped again, by show_pdf_paths().
// doc: the document opened with pdf_open_document()
static pdf_document *doc = NULL;
// ctx: the context created with fz_new_context()
static fz_context *ctx = NULL;
// out: fz_stdout(), or a file output when an output path is configured
static fz_output *out = NULL;

// One byte per PDF object number; 1 marks an indirect object that
// traverse_node() has already visited.
// Allocated and freed by traverse_nodes_start().
static char *objHashTable = NULL;
// Object count reported by pdf_count_objects(); the table holds one entry
// more than this.
static unsigned int objHashTableSize = 0;

// One byte per hash slot; 1 marks a slot already claimed by some path
// (see record_path()).
static char pathHashTable[PATH_HASH_TABLE_SIZE];
// Number of paths that claimed a free slot, i.e. were counted as new.
static unsigned int pathCounter = 0;
// Number of paths whose slot was already taken.
static unsigned int pathHashConflicts = 0;

// Verbosity passed to show_pdf_paths(): 1 prints each new path, 2 adds a
// running number and the object types, 3 also prints conflicting paths and
// hash values. Any other value prints no per-path output.
static short outputFlag = 0;

// -----------------------------------------

// Reduces a NUL-terminated string to a slot index for the path hash table.
// The characters are folded left to right with a multiplier of 31, wrapping
// in unsigned arithmetic, and the result is taken modulo PATH_HASH_TABLE_SIZE.
// The expectation is that only PDF paths are passed to this.
unsigned int hash(const char *word)
{
    unsigned int acc = 0;
    const char *cursor = word;

    while (*cursor != '\0')
    {
        acc = acc * 31 + *cursor;
        cursor++;
    }

    return acc % PATH_HASH_TABLE_SIZE;
}

// -----------------------------------------

// Prints every type tag that applies to a PDF object on standard output.
// An object can match more than one type at the same time, so each type is
// tested on its own and the matching tags are printed back to back in a
// fixed order, without separators or a trailing newline.
// A NULL object prints "/NOT AN OBJECT" and nothing else.
// The 'out' parameter is accepted but not used; all text goes to stdout.
static void write_obj_type(fz_context *ctx, fz_output *out, pdf_obj *obj)
{
    if (obj == NULL)
    {
        fputs("/NOT AN OBJECT", stdout);
        return;
    }

    if (pdf_is_indirect(ctx, obj))
    {
        fputs("/INDIRECT", stdout);
    }
    if (pdf_is_null(ctx, obj))
    {
        fputs("/NULL", stdout);
    }
    if (pdf_is_bool(ctx, obj))
    {
        fputs("/BOOL", stdout);
    }
    if (pdf_is_int(ctx, obj))
    {
        fputs("/INT", stdout);
    }
    if (pdf_is_real(ctx, obj))
    {
        fputs("/REAL", stdout);
    }
    if (pdf_is_string(ctx, obj))
    {
        fputs("/STRING", stdout);
    }
    if (pdf_is_name(ctx, obj))
    {
        fputs("/NAME", stdout);
    }
    if (pdf_is_array(ctx, obj))
    {
        fputs("/ARRAY", stdout);
    }
    if (pdf_is_dict(ctx, obj))
    {
        fputs("/DICT", stdout);
    }
    if (pdf_is_stream(ctx, obj))
    {
        fputs("/PDFSTREAM", stdout);
    }
}

// -----------------------------------------

// Records one path found during traversal and prints it as selected by
// outputFlag.
//
// path - the full path string, e.g. "/Root"
// obj  - the object found at that path; only used to print its types
//
// The path is hashed and its slot in pathHashTable is checked. A free slot is
// claimed and pathCounter is incremented; an occupied slot counts as a hash
// conflict (pathHashConflicts is incremented) and the path is not counted.
// Only the hash is stored, so a different path that happens to share a slot
// is treated exactly like a repeat of the same path.
//
// Output per outputFlag:
//   1 - the path, for new paths only
//   2 - running number, path and object types, for new paths only
//   3 - as 2 plus the hash value; conflicting paths are shown as well,
//       marked "(CONFLICT)" instead of a number
//   any other value - nothing
static void record_path(const char* path, pdf_obj *obj)
{
    const unsigned int pathHash = hash(path);
    const int hashConflict = (pathHashTable[pathHash] == 1);

    if (hashConflict)
    {
        pathHashConflicts++;
    }
    else
    {
        pathCounter++;
        pathHashTable[pathHash] = 1;
    }

    switch (outputFlag)
    {
    case 1:
        if (!hashConflict)
            printf("%s\n", path);
        break;

    case 2:
        if (!hashConflict)
        {
            // pathCounter is unsigned, so it is printed with %u
            printf("(%u) %s\n", pathCounter, path);
            printf("   TYPE = ");
            write_obj_type(ctx, out, obj);
            printf("\n");
        }
        break;

    case 3:
        if (!hashConflict)
            printf("(%u) %s\n", pathCounter, path);
        else
            printf("(CONFLICT) %s\n", path);
        printf("   TYPE = ");
        write_obj_type(ctx, out, obj);
        // reuse the hash computed above rather than hashing the path again
        printf("\n   HASH = %u\n", pathHash);
        break;

    default:
        break;
    }
}

// -----------------------------------------

// Returns 1 when an snprintf() result shows that the whole string fitted
// into a buffer of 'cap' bytes, terminator included; 0 on error or truncation.
static int path_fits(int len, size_t cap)
{
    return (len >= 0) && ((size_t)len < cap);
}

// Walks a PDF object and everything reachable from it, recording a path for
// each named reference, dictionary or array met on the way.
//
// obj    - the object to visit
// prefix - path of the parent; "" at the top
// name   - name of this object within its parent; may be ""
//
// When name is not empty and obj is an indirect reference, a dictionary or an
// array, "<prefix>/<name>" becomes the path of obj and is passed to
// record_path(). Otherwise obj keeps the parent's path.
// Dictionary entries are visited under their key. Array elements are visited
// under whatever pdf_to_name() returns for the element itself.
// An indirect object is descended into only the first time it is reached;
// objHashTable remembers the object numbers already seen.
//
// Paths and names are held in fixed 1024-byte buffers. A path or name that
// does not fit is reported on stderr and that branch is skipped instead of
// being written past the end of the buffer. An indirect reference whose
// object number lies outside objHashTable is reported and skipped as well.
//
// NOTE(review): recursion depth follows the nesting depth of the document and
// is not limited here.
static void traverse_node(pdf_obj *obj, const char* prefix, const char *name)
{
    char newPrefix[1024], newName[1024];
    int i, n, num, len;

    if ( ( name[0] != '\0' ) &&
            ( pdf_is_indirect(ctx, obj) || pdf_is_dict(ctx, obj) || pdf_is_array(ctx, obj) ) )
    {
        len = snprintf(newPrefix, sizeof newPrefix, "%s/%s", prefix, name);
        if (!path_fits(len, sizeof newPrefix))
        {
            fprintf(stderr, "path too long, skipping: %s/%s\n", prefix, name);
            return;
        }
        record_path(newPrefix, obj);
    }
    else
    {
        len = snprintf(newPrefix, sizeof newPrefix, "%s", prefix);
        if (!path_fits(len, sizeof newPrefix))
        {
            fprintf(stderr, "path too long, skipping: %s\n", prefix);
            return;
        }
    }

    if (pdf_is_indirect(ctx, obj))
    {
        num = pdf_to_num(ctx, obj);
        // the table holds objHashTableSize + 1 entries: indices 0..objHashTableSize
        if ( (num < 0) || ((unsigned int)num > objHashTableSize) )
        {
            fprintf(stderr, "object number %d out of range, skipping\n", num);
            return;
        }
        if (objHashTable[num] == 1)
            return;
        objHashTable[num] = 1;
    }

    if (pdf_is_dict(ctx, obj))
    {
        n = pdf_dict_len(ctx, obj);
        for (i = 0; i < n; i++)
        {
            len = snprintf(newName, sizeof newName, "%s",
                    pdf_to_name(ctx, pdf_dict_get_key(ctx, obj, i)));
            if (!path_fits(len, sizeof newName))
            {
                fprintf(stderr, "name too long under %s, skipping\n", newPrefix);
                continue;
            }
            traverse_node(pdf_dict_get_val(ctx, obj, i), newPrefix, newName);
        }
    }

    if (pdf_is_array(ctx, obj))
    {
        n = pdf_array_len(ctx, obj);
        for (i = 0; i < n; i++)
        {
            len = snprintf(newName, sizeof newName, "%s",
                    pdf_to_name(ctx, pdf_array_get(ctx, obj, i)));
            if (!path_fits(len, sizeof newName))
            {
                fprintf(stderr, "name too long under %s, skipping\n", newPrefix);
                continue;
            }
            traverse_node(pdf_array_get(ctx, obj, i), newPrefix, newName);
        }
    }
}

// -----------------------------------------

// starting point for traversal
// clears the hash table and object reference tables
// starts traversal from root node, i.e. node 1
static void traverse_nodes_start()
{
    int i;
    if (!doc)
        fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified");
    pathHashConflicts = 0;
    objHashTableSize = pdf_count_objects(ctx, doc);
    objHashTable = malloc(sizeof(char) * (objHashTableSize + 1));
    for (i = 0; i < (objHashTableSize + 1); i++)
        objHashTable[i] = 0;
    for (i = 0; i < PATH_HASH_TABLE_SIZE; i++)
        pathHashTable[i] = 0;
    traverse_node(pdf_load_object(ctx, doc, 1), "", "Root");
    free(objHashTable);
}

// -----------------------------------------

// Main operations function.
// Creates a MuPDF context, opens the PDF named by 'filename', walks it with
// traverse_nodes_start() and prints a summary on stdout.
//
// filename - path of the PDF to read
// flag     - verbosity, stored in outputFlag; 3 additionally prints the hash
//            table size and the number of hash conflicts in the summary
//
// Returns 0 on completion. If the context cannot be created, or any MuPDF
// call throws, a message is written to stderr and the process exits with
// status 1.
// A password-protected document that cannot be authenticated only produces a
// warning; the traversal is still attempted.
int show_pdf_paths(const char *filename, int flag)
{
    char *password = NULL; /* don't throw errors if encrypted */
    char *output = NULL;   /* no output file configured: stdout is used */
    outputFlag = flag;

    ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
    if (!ctx)
    {
        fprintf(stderr, "cannot initialise context\n");
        exit(1);
    }

    fz_var(doc);
    fz_try(ctx)
    {
        // created inside the try block so that a failure to open an output
        // file is caught below like every other MuPDF error
        if (output)
            out = fz_new_output_with_path(ctx, output, 0);
        else
            out = fz_stdout(ctx);

        doc = pdf_open_document(ctx, filename);
        if (pdf_needs_password(ctx, doc))
            if (!pdf_authenticate_password(ctx, doc, password))
                fz_warn(ctx, "cannot authenticate password: %s", filename);
        printf("Reading From PDF, Filename = %s\n", filename);
        traverse_nodes_start();
        // both counters are unsigned, so they are printed with %u
        printf("Paths Counted = %u\n", pathCounter);
        if (outputFlag == 3)
        {
            printf("Hash Size = %d\n", PATH_HASH_TABLE_SIZE);
            printf("Hash Conflicts = %u\n", pathHashConflicts);
        }
        fz_close_output(ctx, out);
    }
    fz_catch(ctx)
    {
        fprintf(stderr, "some bad error was caught, exiting\n");
        exit(1);
    }

    fz_drop_output(ctx, out);
    pdf_drop_document(ctx, doc);
    fz_drop_context(ctx);

    // do not leave the file-scope pointers referring to released objects
    out = NULL;
    doc = NULL;
    ctx = NULL;
    return 0;
}

// -----------------------------------------

// Stub main function.
// Usage: <program> filename [flag]
// Takes the filename and the optional output flag (default 0) from the
// command line and passes them to show_pdf_paths().
// Returns the result of show_pdf_paths(), or EXIT_FAILURE when the arguments
// are missing, too many, or the flag is not a whole number.
int main(int argc, char **argv)
{
    int flag = 0;
    long parsed;
    char *end;

    if (argc < 2)
    {
        fprintf(stderr, "missing filename parameter\n");
        return EXIT_FAILURE;
    }
    if (argc > 3)
    {
        fprintf(stderr, "too many parameters\n");
        return EXIT_FAILURE;
    }

    if (argc == 3)
    {
        // strtol reports where parsing stopped, so input that is not a
        // number can be rejected instead of silently becoming 0
        parsed = strtol(argv[2], &end, 10);
        if ( (end == argv[2]) || (*end != '\0') ||
                (parsed < INT_MIN) || (parsed > INT_MAX) )
        {
            fprintf(stderr, "invalid output flag: %s\n", argv[2]);
            return EXIT_FAILURE;
        }
        flag = (int)parsed;
    }

    return show_pdf_paths(argv[1], flag);
}

// -----------------------------------------

// THE END