Advertisement
teyu321

Untitled

Jan 24th, 2018
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 7.30 KB | None | 0 0
  1. // -----------------------------------------
  2.  
  3. #include "mupdf/pdf.h"
  4.  
  5. #include <stdlib.h>
  6. #include <stdio.h>
  7. #include <string.h>
  8. #include <limits.h>
  9.  
  // Number of one-byte buckets in pathHashTable; hash() reduces every path to
  // an index below this value.
  // 1 << 20 is 1024 * 1024 buckets, i.e. a full megabyte of table.
  // The table has to be large enough that different paths rarely land in the
  // same bucket; whether a very large PDF can defeat this size is unknown.
  #define PATH_HASH_TABLE_SIZE (1 << 20)
  14.  
  15. // -----------------------------------------
  16.  
  17. static pdf_document *doc = NULL;
  18. static fz_context *ctx = NULL;
  19. static fz_output *out = NULL;
  20.  
  21. static char *objHashTable = NULL;
  22. static unsigned int objHashTableSize = 0;
  23.  
  24. static char pathHashTable[PATH_HASH_TABLE_SIZE];
  25. static unsigned int pathCounter = 0;
  26. static unsigned int pathHashConflicts = 0;
  27.  
  28. static short outputFlag = 0;
  29.  
  30. // -----------------------------------------
  31.  
  32. // this function takes a string and creates a hash value from it
  33. // expectation is only PDF paths will be passed to this
  34. unsigned int hash(const char *word)
  35. {
  36.     unsigned int hashval;
  37.     for (hashval = 0; *word != '\0'; word++)
  38.         hashval = *word + 31 * hashval;
  39.     return hashval % PATH_HASH_TABLE_SIZE;
  40. }
  41.  
  42. // -----------------------------------------
  43.  
  44. // this function takes a pdf object and will write all of it's types
  45. // note that objects can be of more than one type at the same time
  46. static void write_obj_type(fz_context *ctx, fz_output *out, pdf_obj *obj)
  47. {
  48.     if (!obj)
  49.         printf("/NOT AN OBJECT");
  50.     else {
  51.         if (pdf_is_indirect(ctx, obj))
  52.             printf("/INDIRECT");
  53.         if (pdf_is_null(ctx, obj))
  54.             printf("/NULL");
  55.         if (pdf_is_bool(ctx, obj))
  56.             printf("/BOOL");
  57.         if (pdf_is_int(ctx, obj))
  58.             printf("/INT");
  59.         if (pdf_is_real(ctx, obj))
  60.             printf("/REAL");
  61.         if (pdf_is_string(ctx, obj))
  62.             printf("/STRING");
  63.         if (pdf_is_name(ctx, obj))
  64.             printf("/NAME");
  65.         if (pdf_is_array(ctx, obj))
  66.             printf("/ARRAY");
  67.         if (pdf_is_dict(ctx, obj))
  68.             printf("/DICT");
  69.         if (pdf_is_stream(ctx, obj))
  70.             printf("/PDFSTREAM");
  71.     }
  72. }
  73.  
  74. // -----------------------------------------
  75.  
  76. // this function takes a path and a pdf object
  77. // it will then check to see if this path has been added to the hash table
  78. // if it has then it will be noted as a hash conflict
  79. // depending on the output flag level the path will be output
  80. // the path itself, the object type, it's hash, and if it is conflicted
  81. // only paths that pass the hash check will be recored as a new path found
  82. // this should in theory prevent matching paths from being recorded
  83. static void record_path(const char* path, pdf_obj *obj)
  84. {
  85.     unsigned int pathHash;
  86.     int hashConflict;
  87.     hashConflict = 0;
  88.     pathHash = hash(path);
  89.     if (pathHashTable[pathHash] == 1)
  90.     {
  91.         hashConflict = 1;
  92.         pathHashConflicts++;
  93.     }
  94.     else
  95.     {
  96.         pathCounter++;
  97.         pathHashTable[pathHash] = 1;
  98.     }
  99.     if ( (hashConflict == 0) && (outputFlag == 1) )
  100.     {
  101.         printf("%s\n", path);
  102.     }
  103.     else if ( (hashConflict == 0) && (outputFlag == 2) )
  104.     {
  105.         printf("(%d) %s\n", pathCounter, path);
  106.         printf("   TYPE = ");
  107.         write_obj_type(ctx, out, obj);
  108.         printf("\n");
  109.     }
  110.     else if (outputFlag == 3)
  111.     {
  112.         if (hashConflict == 0)
  113.             printf("(%d) %s\n", pathCounter, path);
  114.         else
  115.             printf("(CONFLICT) %s\n", path);
  116.         printf("   TYPE = ");
  117.         write_obj_type(ctx, out, obj);
  118.         printf("\n   HASH = %d\n", hash(path));
  119.     }
  120. }
  121.  
  122. // -----------------------------------------
  123.  
  124. // this function takes a pdf object, a string prefix, and a string name
  125. // it will then build a new prefix, combining prefix and name, assuming name is not empty
  126. // then will traverse all sub-objects, i.e. contained in dictionaries and arrays
  127. // indirect references will also be traversed if they have not been already
  128. // non-conflicting paths will be recorded as a 'good' path
  129. static void traverse_node(pdf_obj *obj, const char* prefix, const char *name)
  130. {
  131.     int i, n, continue_traverse;
  132.     char newPrefix[1024], newName[1024];
  133.     strcpy(newPrefix, prefix);
  134.     if ( ( strlen(name) > 0 ) &&
  135.             ( pdf_is_indirect(ctx, obj) || pdf_is_dict(ctx, obj) || pdf_is_array(ctx, obj) ) )
  136.     {
  137.         strcat(newPrefix, "/");
  138.         strcat(newPrefix, name);
  139.         record_path(newPrefix, obj);
  140.     }
  141.     continue_traverse = 1;
  142.     if (pdf_is_indirect(ctx, obj))
  143.     {
  144.         if (objHashTable[pdf_to_num(ctx, obj)] == 1)
  145.             continue_traverse = 0;
  146.     }
  147.     if (continue_traverse == 1)
  148.     {
  149.         if (pdf_is_indirect(ctx, obj))
  150.         {
  151.             objHashTable[pdf_to_num(ctx, obj)] = 1;
  152.         }
  153.         if (pdf_is_dict(ctx, obj))
  154.         {
  155.             n = pdf_dict_len(ctx, obj);
  156.             for (i = 0; i < n; i++)
  157.             {
  158.                 strcpy(newName, pdf_to_name(ctx, pdf_dict_get_key(ctx, obj, i)));
  159.                 traverse_node(pdf_dict_get_val(ctx, obj, i), newPrefix, newName);
  160.             }
  161.         }
  162.         if (pdf_is_array(ctx, obj))
  163.         {
  164.             n = pdf_array_len(ctx, obj);
  165.             for (i = 0; i < n; i++)
  166.             {
  167.                 strcpy(newName, pdf_to_name(ctx, pdf_array_get(ctx, obj, i)));
  168.                 traverse_node(pdf_array_get(ctx, obj, i), newPrefix, newName);
  169.             }
  170.         }
  171.     }
  172.  
  173. }
  174.  
  175. // -----------------------------------------
  176.  
  177. // starting point for traversal
  178. // clears the hash table and object reference tables
  179. // starts traversal from root node, i.e. node 1
  180. static void traverse_nodes_start()
  181. {
  182.     int i;
  183.     if (!doc)
  184.         fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified");
  185.     pathHashConflicts = 0;
  186.     objHashTableSize = pdf_count_objects(ctx, doc);
  187.     objHashTable = malloc(sizeof(char) * (objHashTableSize + 1));
  188.     for (i = 0; i < (objHashTableSize + 1); i++)
  189.         objHashTable[i] = 0;
  190.     for (i = 0; i < PATH_HASH_TABLE_SIZE; i++)
  191.         pathHashTable[i] = 0;
  192.     traverse_node(pdf_load_object(ctx, doc, 1), "", "Root");
  193.     free(objHashTable);
  194. }
  195.  
  196. // -----------------------------------------
  197.  
  198. // main operations function
  199. // loads a PDF from a filename, and begins traversal
  200. // based on outputflag will indicate information upon completion
  201. int show_pdf_paths(const char *filename, int flag)
  202. {
  203.     char *password = NULL; /* don't throw errors if encrypted */
  204.     char *output = NULL;
  205.     outputFlag = flag;
  206.  
  207.     ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
  208.     if (!ctx)
  209.     {
  210.         printf("cannot initialise context\n");
  211.         exit(1);
  212.     }
  213.  
  214.     if (output)
  215.         out = fz_new_output_with_path(ctx, output, 0);
  216.     else
  217.         out = fz_stdout(ctx);
  218.  
  219.     fz_var(doc);
  220.     fz_try(ctx)
  221.     {
  222.         doc = pdf_open_document(ctx, filename);
  223.         if (pdf_needs_password(ctx, doc))
  224.             if (!pdf_authenticate_password(ctx, doc, password))
  225.                 fz_warn(ctx, "cannot authenticate password: %s", filename);
  226.         printf("Reading From PDF, Filename = %s\n", filename);
  227.         traverse_nodes_start();
  228.         printf("Paths Counted = %d\n", pathCounter);
  229.         if (outputFlag == 3)
  230.         {
  231.             printf("Hash Size = %d\n", PATH_HASH_TABLE_SIZE);
  232.             printf("Hash Conflicts = %d\n", pathHashConflicts);
  233.         }
  234.         fz_close_output(ctx, out);
  235.     }
  236.     fz_catch(ctx)
  237.     {
  238.         printf("some bad error was caught, exiting\n");
  239.         exit(1);
  240.     }
  241.  
  242.     fz_drop_output(ctx, out);
  243.     pdf_drop_document(ctx, doc);
  244.     fz_drop_context(ctx);
  245.     return 0;
  246. }
  247.  
  248. // -----------------------------------------
  249.  
  // Command line entry point.
  //
  // Usage: program filename [flag]
  //   filename: PDF to read.
  //   flag:     optional decimal output level, 0 when omitted.
  //
  // Returns the result of show_pdf_paths(). A wrong number of arguments or a
  // flag that is not a complete decimal integer is reported on stderr and
  // yields EXIT_FAILURE, so scripts can tell a usage error from success.
  int main(int argc, char **argv)
  {
      long flag = 0;

      if ( (argc < 2) || (argc > 3) )
      {
          fprintf(stderr, "usage: %s filename [flag]\n",
                  (argc > 0) ? argv[0] : "program");
          return EXIT_FAILURE;
      }

      if (argc == 3)
      {
          char *end = NULL;

          // strtol, unlike atoi, lets us reject text that is not a number
          flag = strtol(argv[2], &end, 10);
          if (end == argv[2] || *end != '\0' || flag < INT_MIN || flag > INT_MAX)
          {
              fprintf(stderr, "invalid output flag: %s\n", argv[2]);
              return EXIT_FAILURE;
          }
      }

      return show_pdf_paths(argv[1], (int)flag);
  }
  268.  
  269. // -----------------------------------------
  270.  
  271. // THE END
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement