Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using System;
- using System.Collections.Generic;
- using System.IO;
- using System.Text;
- using Datalogics.PDFL;
- /*
- *
- * A sample which demonstrates splitting a PDF document based on page intervals or bookmarks or by hits on
- * key search strings. To split a document, the application needs to create a new, empty document and insert pages
- * from the source document into the target documents(s)
- *
- * This type of application/process might be used for splitting consolidated statement type reports -
- * for example, a 1000 page financial PDF that is comprised of smaller 3-5 page reports representing individual accounts.
- *
- * Copyright (c) 2007-2010, Datalogics, Inc. All rights reserved.
- *
- * The information and code in this sample is for the exclusive use of Datalogics
- * customers and evaluation users only. Datalogics permits you to use, modify and
- * distribute this file in accordance with the terms of your license agreement.
- * Sample code is for demonstrative purposes only and is not intended for production use.
- *
- */
- namespace SplitPDFVariations
- {
- class SplitPDFVariations
- {
- static void Main(string[] args)
- {
- string inputFile = "..\\..\\Resources\\Sample_Input\\Constitution.pdf"; // input document
- bool splitByBookmarks = false; // extract by bookmarks if they exist
- //
- bool splitByTextString = false; // extract by specified search string
- string splitTextString = "BREF APER�U"; // string to search for
- bool splitByPageInterval = true; // extract by specified number of page interval
- int splitPageInterval = 2; // page interval to use
- List<int> listOfPageNumsToSplit = new List<int>();
- using (Library lib = new Library())
- {
- Console.WriteLine("Initialized the library.");
- Document doc = new Document(inputFile); //
- Console.WriteLine("Opened document " + inputFile);
- if (splitByTextString)
- {
- FindTextUntagged(doc, splitTextString, listOfPageNumsToSplit);
- }
- else if (splitByBookmarks)
- {
- Bookmark rootBookmark = doc.BookmarkRoot;
- Console.WriteLine("Number of bookmarks = " + rootBookmark.Count);
- EnumerateBookmarks(rootBookmark, listOfPageNumsToSplit);
- }
- else if (splitByPageInterval)
- {
- FindPageSets(doc, splitPageInterval, listOfPageNumsToSplit);
- }
- if (listOfPageNumsToSplit.Count > 0)
- SplitPDF(doc, listOfPageNumsToSplit);
- else
- Console.WriteLine("No pages to split. Exiting.");
- }
- }
- static void SplitPDF(Document doc, List<int> listOfPageNumsToSplit)
- {
- int numFiles = listOfPageNumsToSplit.Count;
- int numPagesToSplit = 0;
- Console.WriteLine("Splitting into " + numFiles + " files.");
- try
- {
- for (int j = 0; j < numFiles; j++)
- {
- Document outDoc = new Document();
- if (j < numFiles - 1)
- numPagesToSplit = listOfPageNumsToSplit[j + 1] - listOfPageNumsToSplit[j];
- else
- numPagesToSplit = doc.NumPages - listOfPageNumsToSplit[j];
- outDoc.InsertPages(Document.BeforeFirstPage, doc, listOfPageNumsToSplit[j], numPagesToSplit, PageInsertFlags.Bookmarks | PageInsertFlags.Threads);
- outDoc.Save(SaveFlags.Full, "Split" + j + ".pdf");
- }
- }
- catch (ApplicationException ae)
- {
- Console.WriteLine(ae.Message);
- }
- }
- static void EnumerateBookmarks(Bookmark bMark, List<int> listOfPageNumsToSplit)
- {
- if (bMark != null)
- {
- Console.WriteLine("Bookmark Title: " + bMark.Title);
- ViewDestination vDest = bMark.ViewDestination;
- int count = 0;
- if (vDest != null)
- {
- Console.WriteLine("Bookmark Destination = page: " + vDest.PageNumber);
- // Multiple bookmarks can point to the same destination page, so skip repeats
- if (listOfPageNumsToSplit.Contains(vDest.PageNumber) == false)
- listOfPageNumsToSplit.Add(vDest.PageNumber);
- count++;
- }
- EnumerateBookmarks(bMark.FirstChild, listOfPageNumsToSplit);
- EnumerateBookmarks(bMark.Next, listOfPageNumsToSplit);
- }
- }
- static void FindPageSets(Document doc, int splitPageInterval, List<int> listOfPageNumsToSplit)
- {
- int nPages = doc.NumPages;
- // PDF page numbers are 0 based (add 1 to get the user sequential page number).
- // Get the modulo (remainder). If the remainder is 0, then split on that page.
- // For example: 5 page document, split interval of 2, you want to split the
- // document at pages 0, 2, 4 (internal PDF page number) a.k.a pages 1, 3, 5.
- if (splitPageInterval < 1)
- splitPageInterval = 1; // prevents invalid split interval / divide by 0 problems
- listOfPageNumsToSplit.Add(0); //Always split on the first page (page 0)
- for (int i = 1; i < doc.NumPages; i++)
- {
- if (i % splitPageInterval == 0)
- listOfPageNumsToSplit.Add(i);
- }
- }
- /* This function is copied primarily from the TextExtract sample,
- * but modified to skip writing out the text that it finds
- */
- static void FindTextUntagged(Document doc, String splitTextString, List<int> listOfPageNumsToSplit)
- {
- // setup the WordFinderConfig
- WordFinderConfig wordConfig = new WordFinderConfig();
- wordConfig.IgnoreCharGaps = false;
- wordConfig.IgnoreLineGaps = false;
- wordConfig.NoAnnots = false;
- wordConfig.NoEncodingGuess = false;
- // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
- wordConfig.UnknownToStdEnc = false;
- wordConfig.DisableTaggedPDF = false; // legacy mode WordFinder creation
- wordConfig.NoXYSort = true;
- wordConfig.PreserveSpaces = false;
- wordConfig.NoLigatureExp = false;
- wordConfig.NoHyphenDetection = false;
- wordConfig.TrustNBSpace = false;
- wordConfig.NoExtCharOffset = false; // text extraction efficiency
- wordConfig.NoStyleInfo = false; // text extraction efficiency
- WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);
- int nPages = doc.NumPages;
- IList<Word> pageWords = null;
- for (int i = 0; i < nPages; i++)
- {
- pageWords = wordFinder.GetWordList(i);
- String textToExtract = "";
- // By default, this searches the entire page word list.
- // You could limit it to the first X (e.g. 200) number of words as shown below if you know that the
- // search string will fall within a certain number of words. If you wanted to only look within
- // a specific quadrant of a page (e.g. lower right corner), you would need to get the bounding box
- // of each Word and compare that to your target area.
- int wordLoop = Math.Min(pageWords.Count,200);
- for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
- //for (int wordnum = 0; wordnum < wordLoop; wordnum++) // limit by the fixt X number of Words
- {
- Word wInfo;
- wInfo = pageWords[wordnum];
- string s = wInfo.Text;
- // Check for hyphenated words that break across a line.
- if (((wInfo.Attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen) &&
- ((wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine))
- {
- // For the purposes of this sample, we'll remove all hyphens. In practice, you may need to check
- // words against a dictionary to determine if the hyphenated word is actually one word or two.
- string[] splitstrs = s.Split(new Char[] {'-', '\u00ad'});
- textToExtract += splitstrs[0] + splitstrs[1];
- }
- else
- textToExtract += s;
- // Check for space adjacency and add a space if necessary.
- if ((wInfo.Attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
- {
- textToExtract += " ";
- }
- // Check for a line break and add one if necessary
- if ((wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine)
- textToExtract += "\n";
- }
- //
- if (textToExtract.ToUpper().Contains(splitTextString))
- {
- Console.WriteLine("Found " + splitTextString + " on page " + i);
- listOfPageNumsToSplit.Add(i);
- }
- // Release requested WordList
- for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
- pageWords[wordnum].Dispose();
- }
- }
- }
- }
Add Comment
Please, Sign In to add comment