Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // -----------------------------------------
- #include "mupdf/pdf.h"
- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
- #include <limits.h>
- // value needs to be sufficently large to prevent accidental hash value matches
- // default value, (1024 * 1024) is a full megabyte and should be big enough for most cases
- // given a large enough PDF this could cause issues, currently unknown
- #define PATH_HASH_TABLE_SIZE ( 1024 * 1024)
- // -----------------------------------------
- static pdf_document *doc = NULL;
- static fz_context *ctx = NULL;
- static fz_output *out = NULL;
- static char *objHashTable = NULL;
- static unsigned int objHashTableSize = 0;
- static char pathHashTable[PATH_HASH_TABLE_SIZE];
- static unsigned int pathCounter = 0;
- static unsigned int pathHashConflicts = 0;
- static short outputFlag = 0;
- // -----------------------------------------
// Computes a hash value for a NUL-terminated string.
// The expectation is that only PDF paths will be passed to this.
//
// word:    string to hash; must not be NULL.
// returns: a value in the range [0, PATH_HASH_TABLE_SIZE).
//
// Each character is read as an unsigned char, so strings containing bytes
// >= 0x80 hash to the same value whether plain char is signed or unsigned
// on the target platform.
unsigned int hash(const char *word)
{
    unsigned int hashval = 0;

    for (; *word != '\0'; word++)
        hashval = (unsigned char)*word + 31u * hashval;

    return hashval % PATH_HASH_TABLE_SIZE;
}
- // -----------------------------------------
- // this function takes a pdf object and will write all of it's types
- // note that objects can be of more than one type at the same time
- static void write_obj_type(fz_context *ctx, fz_output *out, pdf_obj *obj)
- {
- if (!obj)
- printf("/NOT AN OBJECT");
- else {
- if (pdf_is_indirect(ctx, obj))
- printf("/INDIRECT");
- if (pdf_is_null(ctx, obj))
- printf("/NULL");
- if (pdf_is_bool(ctx, obj))
- printf("/BOOL");
- if (pdf_is_int(ctx, obj))
- printf("/INT");
- if (pdf_is_real(ctx, obj))
- printf("/REAL");
- if (pdf_is_string(ctx, obj))
- printf("/STRING");
- if (pdf_is_name(ctx, obj))
- printf("/NAME");
- if (pdf_is_array(ctx, obj))
- printf("/ARRAY");
- if (pdf_is_dict(ctx, obj))
- printf("/DICT");
- if (pdf_is_stream(ctx, obj))
- printf("/PDFSTREAM");
- }
- }
- // -----------------------------------------
- // this function takes a path and a pdf object
- // it will then check to see if this path has been added to the hash table
- // if it has then it will be noted as a hash conflict
- // depending on the output flag level the path will be output
- // the path itself, the object type, it's hash, and if it is conflicted
- // only paths that pass the hash check will be recored as a new path found
- // this should in theory prevent matching paths from being recorded
- static void record_path(const char* path, pdf_obj *obj)
- {
- unsigned int pathHash;
- int hashConflict;
- hashConflict = 0;
- pathHash = hash(path);
- if (pathHashTable[pathHash] == 1)
- {
- hashConflict = 1;
- pathHashConflicts++;
- }
- else
- {
- pathCounter++;
- pathHashTable[pathHash] = 1;
- }
- if ( (hashConflict == 0) && (outputFlag == 1) )
- {
- printf("%s\n", path);
- }
- else if ( (hashConflict == 0) && (outputFlag == 2) )
- {
- printf("(%d) %s\n", pathCounter, path);
- printf(" TYPE = ");
- write_obj_type(ctx, out, obj);
- printf("\n");
- }
- else if (outputFlag == 3)
- {
- if (hashConflict == 0)
- printf("(%d) %s\n", pathCounter, path);
- else
- printf("(CONFLICT) %s\n", path);
- printf(" TYPE = ");
- write_obj_type(ctx, out, obj);
- printf("\n HASH = %d\n", hash(path));
- }
- }
- // -----------------------------------------
- // this function takes a pdf object, a string prefix, and a string name
- // it will then build a new prefix, combining prefix and name, assuming name is not empty
- // then will traverse all sub-objects, i.e. contained in dictionaries and arrays
- // indirect references will also be traversed if they have not been already
- // non-conflicting paths will be recorded as a 'good' path
- static void traverse_node(pdf_obj *obj, const char* prefix, const char *name)
- {
- int i, n, continue_traverse;
- char newPrefix[1024], newName[1024];
- strcpy(newPrefix, prefix);
- if ( ( strlen(name) > 0 ) &&
- ( pdf_is_indirect(ctx, obj) || pdf_is_dict(ctx, obj) || pdf_is_array(ctx, obj) ) )
- {
- strcat(newPrefix, "/");
- strcat(newPrefix, name);
- record_path(newPrefix, obj);
- }
- continue_traverse = 1;
- if (pdf_is_indirect(ctx, obj))
- {
- if (objHashTable[pdf_to_num(ctx, obj)] == 1)
- continue_traverse = 0;
- }
- if (continue_traverse == 1)
- {
- if (pdf_is_indirect(ctx, obj))
- {
- objHashTable[pdf_to_num(ctx, obj)] = 1;
- }
- if (pdf_is_dict(ctx, obj))
- {
- n = pdf_dict_len(ctx, obj);
- for (i = 0; i < n; i++)
- {
- strcpy(newName, pdf_to_name(ctx, pdf_dict_get_key(ctx, obj, i)));
- traverse_node(pdf_dict_get_val(ctx, obj, i), newPrefix, newName);
- }
- }
- if (pdf_is_array(ctx, obj))
- {
- n = pdf_array_len(ctx, obj);
- for (i = 0; i < n; i++)
- {
- strcpy(newName, pdf_to_name(ctx, pdf_array_get(ctx, obj, i)));
- traverse_node(pdf_array_get(ctx, obj, i), newPrefix, newName);
- }
- }
- }
- }
- // -----------------------------------------
- // starting point for traversal
- // clears the hash table and object reference tables
- // starts traversal from root node, i.e. node 1
- static void traverse_nodes_start()
- {
- int i;
- if (!doc)
- fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified");
- pathHashConflicts = 0;
- objHashTableSize = pdf_count_objects(ctx, doc);
- objHashTable = malloc(sizeof(char) * (objHashTableSize + 1));
- for (i = 0; i < (objHashTableSize + 1); i++)
- objHashTable[i] = 0;
- for (i = 0; i < PATH_HASH_TABLE_SIZE; i++)
- pathHashTable[i] = 0;
- traverse_node(pdf_load_object(ctx, doc, 1), "", "Root");
- free(objHashTable);
- }
- // -----------------------------------------
- // main operations function
- // loads a PDF from a filename, and begins traversal
- // based on outputflag will indicate information upon completion
- int show_pdf_paths(const char *filename, int flag)
- {
- char *password = NULL; /* don't throw errors if encrypted */
- char *output = NULL;
- outputFlag = flag;
- ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
- if (!ctx)
- {
- printf("cannot initialise context\n");
- exit(1);
- }
- if (output)
- out = fz_new_output_with_path(ctx, output, 0);
- else
- out = fz_stdout(ctx);
- fz_var(doc);
- fz_try(ctx)
- {
- doc = pdf_open_document(ctx, filename);
- if (pdf_needs_password(ctx, doc))
- if (!pdf_authenticate_password(ctx, doc, password))
- fz_warn(ctx, "cannot authenticate password: %s", filename);
- printf("Reading From PDF, Filename = %s\n", filename);
- traverse_nodes_start();
- printf("Paths Counted = %d\n", pathCounter);
- if (outputFlag == 3)
- {
- printf("Hash Size = %d\n", PATH_HASH_TABLE_SIZE);
- printf("Hash Conflicts = %d\n", pathHashConflicts);
- }
- fz_close_output(ctx, out);
- }
- fz_catch(ctx)
- {
- printf("some bad error was caught, exiting\n");
- exit(1);
- }
- fz_drop_output(ctx, out);
- pdf_drop_document(ctx, doc);
- fz_drop_context(ctx);
- return 0;
- }
- // -----------------------------------------
// Program entry point.
//
// Usage: <program> filename [output-flag]
//
// Takes the PDF filename and an optional numeric output flag from the
// command line and hands them to show_pdf_paths. The flag defaults to 0
// when it is not given.
//
// returns: the result of show_pdf_paths, or EXIT_FAILURE when the arguments
//          are unusable (wrong count, or a flag that is not an integer).
int main(int argc, char **argv)
{
    int flag = 0;

    if (argc < 2)
    {
        fprintf(stderr, "missing filename parameter\n");
        return EXIT_FAILURE;
    }
    if (argc > 3)
    {
        fprintf(stderr, "too many parameters, expected: filename [output-flag]\n");
        return EXIT_FAILURE;
    }

    if (argc == 3)
    {
        char *end = NULL;
        const long value = strtol(argv[2], &end, 10);

        // Reject empty input, trailing garbage and values that do not fit
        // an int. A value saturated by strtol on overflow is either caught
        // here or is a huge number, never one of the levels 1-3.
        if (end == argv[2] || *end != '\0' || value < INT_MIN || value > INT_MAX)
        {
            fprintf(stderr, "invalid output flag: %s\n", argv[2]);
            return EXIT_FAILURE;
        }
        flag = (int)value;
    }

    return show_pdf_paths(argv[1], flag);
}
- // -----------------------------------------
- // THE END
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement