Guest User

Untitled

a guest
Dec 15th, 2017
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.88 KB | None | 0 0
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Text;
  5. using Datalogics.PDFL;
  6.  
  7. /*
  8. *
  9. * A sample which demonstrates splitting a PDF document based on page intervals or bookmarks or by hits on
  10. * key search strings. To split a document, the application needs to create a new, empty document and insert pages
  11. * from the source document into the target document(s).
  12. *
  13. * This type of application/process might be used for splitting consolidated statement type reports -
  14. * for example, a 1000 page financial PDF that is comprised of smaller 3-5 page reports representing individual accounts.
  15. *
  16. * Copyright (c) 2007-2010, Datalogics, Inc. All rights reserved.
  17. *
  18. * The information and code in this sample is for the exclusive use of Datalogics
  19. * customers and evaluation users only. Datalogics permits you to use, modify and
  20. * distribute this file in accordance with the terms of your license agreement.
  21. * Sample code is for demonstrative purposes only and is not intended for production use.
  22. *
  23. */
  24.  
  25.  
  26. namespace SplitPDFVariations
  27. {
  28. class SplitPDFVariations
  29. {
  // Entry point. Opens the sample input PDF, collects the 0-based page numbers at which
  // the document should be split (by search string, by bookmarks, or by a fixed page
  // interval), and hands them to SplitPDF. The args parameter is not used.
  static void Main(string[] args)
  {
      string inputFile = "..\\..\\Resources\\Sample_Input\\Constitution.pdf"; // input document

      // Only one strategy runs per invocation. When several flags are set the
      // precedence is: text string, then bookmarks, then page interval.
      bool splitByBookmarks = false;      // extract by bookmarks if they exist
      bool splitByTextString = false;     // extract by specified search string
      // The C-cedilla is written as an escape so the literal survives re-encoding of
      // this source file (it had been corrupted into U+FFFD, which can never match).
      string splitTextString = "BREF APER\u00C7U"; // string to search for
      bool splitByPageInterval = true;    // extract by specified page interval
      int splitPageInterval = 2;          // page interval to use

      List<int> listOfPageNumsToSplit = new List<int>();

      using (Library lib = new Library())
      {
          Console.WriteLine("Initialized the library.");

          // Dispose the document deterministically, and before the enclosing
          // Library scope ends, instead of leaving it to the finalizer.
          using (Document doc = new Document(inputFile))
          {
              Console.WriteLine("Opened document " + inputFile);

              if (splitByTextString)
              {
                  FindTextUntagged(doc, splitTextString, listOfPageNumsToSplit);
              }
              else if (splitByBookmarks)
              {
                  Bookmark rootBookmark = doc.BookmarkRoot;
                  Console.WriteLine("Number of bookmarks = " + rootBookmark.Count);
                  EnumerateBookmarks(rootBookmark, listOfPageNumsToSplit);
              }
              else if (splitByPageInterval)
              {
                  FindPageSets(doc, splitPageInterval, listOfPageNumsToSplit);
              }

              if (listOfPageNumsToSplit.Count > 0)
              {
                  SplitPDF(doc, listOfPageNumsToSplit);
              }
              else
              {
                  Console.WriteLine("No pages to split. Exiting.");
              }
          }
      }
  }
  72.  
  // Writes one output PDF per split point. Each output file "Split<j>.pdf" holds the
  // pages from split point j up to (but not including) the next split point; the last
  // file runs to the end of the source document.
  //
  // doc                   - source document the pages are copied from.
  // listOfPageNumsToSplit - 0-based page numbers at which a new output file starts.
  //                         The list itself is not modified.
  //
  // An ApplicationException raised while splitting is reported on the console and
  // stops the remaining splits.
  static void SplitPDF(Document doc, List<int> listOfPageNumsToSplit)
  {
      // Page counts are computed as the difference between neighbouring split points,
      // so the points must be ascending. Bookmark-derived lists are not guaranteed to
      // be, hence the sorted private copy.
      List<int> splitPoints = new List<int>(listOfPageNumsToSplit);
      splitPoints.Sort();

      int numFiles = splitPoints.Count;
      Console.WriteLine("Splitting into " + numFiles + " files.");
      try
      {
          for (int j = 0; j < numFiles; j++)
          {
              int firstPage = splitPoints[j];
              bool isLastFile = (j == numFiles - 1);
              int endPage = isLastFile ? doc.NumPages : splitPoints[j + 1];
              int numPagesToSplit = endPage - firstPage;

              // Each output document is disposed as soon as it has been saved rather
              // than accumulating until the finalizer runs.
              using (Document outDoc = new Document())
              {
                  outDoc.InsertPages(Document.BeforeFirstPage, doc, firstPage, numPagesToSplit, PageInsertFlags.Bookmarks | PageInsertFlags.Threads);
                  outDoc.Save(SaveFlags.Full, "Split" + j + ".pdf");
              }
          }
      }
      catch (ApplicationException ae)
      {
          Console.WriteLine(ae.Message);
      }
  }
  97.  
  // Walks the bookmark tree starting at bMark (the bookmark itself, its descendants,
  // and its following siblings with their descendants), printing each bookmark's
  // title and collecting the destination page number of every bookmark that has a
  // view destination.
  //
  // bMark                 - bookmark to start from; null is accepted and does nothing.
  // listOfPageNumsToSplit - receives each distinct destination page number, in the
  //                         order the bookmarks are visited.
  static void EnumerateBookmarks(Bookmark bMark, List<int> listOfPageNumsToSplit)
  {
      // Siblings are followed with a loop and only child levels recurse, so the
      // call-stack depth follows the nesting depth of the outline instead of the
      // number of bookmarks on one level.
      for (Bookmark current = bMark; current != null; current = current.Next)
      {
          Console.WriteLine("Bookmark Title: " + current.Title);
          ViewDestination vDest = current.ViewDestination;

          if (vDest != null)
          {
              int pageNumber = vDest.PageNumber;
              Console.WriteLine("Bookmark Destination = page: " + pageNumber);

              // Multiple bookmarks can point to the same destination page, so skip repeats.
              if (!listOfPageNumsToSplit.Contains(pageNumber))
              {
                  listOfPageNumsToSplit.Add(pageNumber);
              }
          }

          EnumerateBookmarks(current.FirstChild, listOfPageNumsToSplit);
      }
  }
  119.  
  120.  
  // Adds a split point at page 0 and at every multiple of splitPageInterval that is
  // below the document's page count.
  //
  // PDF page numbers are 0 based (add 1 to get the user-visible page number).
  // Example: for a 5 page document and an interval of 2 the split points are the
  // internal pages 0, 2, 4, i.e. user pages 1, 3, 5.
  //
  // doc                   - document whose page count bounds the split points.
  // splitPageInterval     - number of pages per section; values below 1 are treated as 1.
  // listOfPageNumsToSplit - receives the split points in ascending order.
  static void FindPageSets(Document doc, int splitPageInterval, List<int> listOfPageNumsToSplit)
  {
      // Read the page count once instead of querying the document on every iteration.
      int nPages = doc.NumPages;

      if (splitPageInterval < 1)
      {
          splitPageInterval = 1; // prevents an invalid (zero or negative) stride
      }

      listOfPageNumsToSplit.Add(0); // always split on the first page (page 0)

      // Stepping by the interval visits exactly the pages whose number is a
      // multiple of the interval.
      for (int i = splitPageInterval; i < nPages; i += splitPageInterval)
      {
          listOfPageNumsToSplit.Add(i);
      }
  }
  141.  
  142.  
  /* This function is copied primarily from the TextExtract sample,
   * but modified to skip writing out the text that it finds.
   *
   * Searches the text of every page of doc for splitTextString (ignoring case) and
   * adds the 0-based number of each page that contains it to listOfPageNumsToSplit.
   * A "Found ..." line is printed for every hit.
   */
  static void FindTextUntagged(Document doc, String splitTextString, List<int> listOfPageNumsToSplit)
  {
      // Set up the WordFinderConfig.
      WordFinderConfig wordConfig = new WordFinderConfig();
      wordConfig.IgnoreCharGaps = false;
      wordConfig.IgnoreLineGaps = false;
      wordConfig.NoAnnots = false;
      wordConfig.NoEncodingGuess = false;
      // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
      wordConfig.UnknownToStdEnc = false;
      wordConfig.DisableTaggedPDF = false; // legacy mode WordFinder creation
      wordConfig.NoXYSort = true;
      wordConfig.PreserveSpaces = false;
      wordConfig.NoLigatureExp = false;
      wordConfig.NoHyphenDetection = false;
      wordConfig.TrustNBSpace = false;
      wordConfig.NoExtCharOffset = false; // text extraction efficiency
      wordConfig.NoStyleInfo = false; // text extraction efficiency

      // Dispose the WordFinder deterministically once all pages have been scanned.
      using (WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig))
      {
          int nPages = doc.NumPages;

          for (int i = 0; i < nPages; i++)
          {
              IList<Word> pageWords = wordFinder.GetWordList(i);
              try
              {
                  string pageText = BuildPageText(pageWords);

                  // Ordinal, case-insensitive search: independent of the current
                  // culture and of the casing used in the search string.
                  if (pageText.IndexOf(splitTextString, StringComparison.OrdinalIgnoreCase) >= 0)
                  {
                      Console.WriteLine("Found " + splitTextString + " on page " + i);
                      listOfPageNumsToSplit.Add(i);
                  }
              }
              finally
              {
                  // Release the requested word list even if the page failed part-way.
                  foreach (Word word in pageWords)
                  {
                      word.Dispose();
                  }
              }
          }
      }
  }

  // Joins the words of one page into a single string, inserting a space after words
  // flagged as adjacent to a space and a newline after the last word of each line.
  //
  // The whole word list is used. If the search string is known to fall within the
  // first N words, the list could be truncated before joining; to search only a
  // region of the page, each Word's bounding box would have to be compared with the
  // target area.
  static string BuildPageText(IList<Word> pageWords)
  {
      StringBuilder text = new StringBuilder();

      foreach (Word wInfo in pageWords)
      {
          bool hasSoftHyphen =
              (wInfo.Attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen;
          bool lastWordOnLine =
              (wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine;
          bool adjacentToSpace =
              (wInfo.Attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace;

          string s = wInfo.Text;

          if (hasSoftHyphen && lastWordOnLine)
          {
              // Hyphenated word that breaks across a line. For the purposes of this
              // sample all hyphens are removed; in practice the word may need to be
              // checked against a dictionary to decide whether it is one word or two.
              // Concatenating every piece copes with any number of hyphens,
              // including none.
              text.Append(string.Concat(s.Split(new Char[] { '-', '\u00ad' })));
          }
          else
          {
              text.Append(s);
          }

          if (adjacentToSpace)
          {
              text.Append(' ');
          }

          if (lastWordOnLine)
          {
              text.Append('\n');
          }
      }

      return text.ToString();
  }
  224.  
  225. }
  226. }
Add Comment
Please, Sign In to add comment