Untitled

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Datalogics.PDFL;

/*
 *
 * A sample which demonstrates splitting a PDF document based on page intervals or bookmarks or by hits on
 * key search strings. To split a document, the application needs to create a new, empty document and insert pages
 * from the source document into the target document(s).
 *
 * This type of application/process might be used for splitting consolidated statement type reports -
 * for example, a 1000 page financial PDF that is comprised of smaller 3-5 page reports representing individual accounts.
 *
 * Copyright (c) 2007-2010, Datalogics, Inc. All rights reserved.
 *
 * The information and code in this sample is for the exclusive use of Datalogics
 * customers and evaluation users only.  Datalogics permits you to use, modify and
 * distribute this file in accordance with the terms of your license agreement.
 * Sample code is for demonstrative purposes only and is not intended for production use.
 *
 */


namespace SplitPDFVariations
{
    /// <summary>
    /// Sample that splits one PDF document into several smaller documents.
    /// The pages at which to split are chosen by one of three strategies:
    /// hits on a search string, bookmark destinations, or a fixed page interval.
    /// </summary>
    class SplitPDFVariations
    {
        /// <summary>
        /// Opens the input document, collects the split points using the selected
        /// strategy and writes the resulting pieces to the current directory.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            string inputFile = "..\\..\\Resources\\Sample_Input\\Constitution.pdf";  // input document

            // Only one strategy is applied per run. When several flags are set the
            // precedence is: text string, then bookmarks, then page interval.
            bool splitByBookmarks = false;           // extract by bookmarks if they exist
            bool splitByTextString = false;          // extract by specified search string
            // The non-ASCII letter is written as an escape so the search string
            // survives any re-encoding of this source file.
            string splitTextString = "BREF APER\u00C7U";  //     string to search for
            bool splitByPageInterval = true;         // extract by specified number of page interval
            int splitPageInterval = 2;               //         page interval to use

            List<int> listOfPageNumsToSplit = new List<int>();

            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");

                // Dispose the source document before the library itself is terminated.
                using (Document doc = new Document(inputFile))
                {
                    Console.WriteLine("Opened document " + inputFile);

                    if (splitByTextString)
                    {
                        FindTextUntagged(doc, splitTextString, listOfPageNumsToSplit);
                    }
                    else if (splitByBookmarks)
                    {
                        Bookmark rootBookmark = doc.BookmarkRoot;
                        Console.WriteLine("Number of bookmarks = " + rootBookmark.Count);
                        EnumerateBookmarks(rootBookmark, listOfPageNumsToSplit);
                    }
                    else if (splitByPageInterval)
                    {
                        FindPageSets(doc, splitPageInterval, listOfPageNumsToSplit);
                    }

                    if (listOfPageNumsToSplit.Count > 0)
                    {
                        SplitPDF(doc, listOfPageNumsToSplit);
                    }
                    else
                    {
                        Console.WriteLine("No pages to split. Exiting.");
                    }
                }
            }
        }

        /// <summary>
        /// Writes one output file per split point. Each file starts at its split
        /// page and runs up to (but not including) the next split page; the last
        /// file runs to the end of the document. Files are named Split0.pdf,
        /// Split1.pdf, ... in ascending page order.
        /// </summary>
        /// <remarks>
        /// Pages that precede the first split point are not written to any file.
        /// An ApplicationException raised while splitting is reported on the
        /// console and stops the remaining output files from being produced.
        /// </remarks>
        /// <param name="doc">Source document to copy pages from.</param>
        /// <param name="listOfPageNumsToSplit">0-based page numbers at which a new file starts. The list itself is not modified.</param>
        static void SplitPDF(Document doc, List<int> listOfPageNumsToSplit)
        {
            // The page-count arithmetic below needs ascending split points, but
            // bookmarks are collected in outline order, which is not necessarily
            // page order. Sort a copy so the caller's list is left untouched.
            List<int> splitPoints = new List<int>(listOfPageNumsToSplit);
            splitPoints.Sort();

            int numFiles = splitPoints.Count;
            Console.WriteLine("Splitting into " + numFiles + " files.");
            try
            {
                int totalPages = doc.NumPages;

                for (int j = 0; j < numFiles; j++)
                {
                    int firstPage = splitPoints[j];
                    int endPage = (j < numFiles - 1) ? splitPoints[j + 1] : totalPages;
                    int numPagesToSplit = endPage - firstPage;

                    // Release each output document as soon as it has been saved.
                    using (Document outDoc = new Document())
                    {
                        outDoc.InsertPages(Document.BeforeFirstPage, doc, firstPage, numPagesToSplit, PageInsertFlags.Bookmarks | PageInsertFlags.Threads);
                        outDoc.Save(SaveFlags.Full, "Split" + j + ".pdf");
                    }
                }
            }
            catch (ApplicationException ae)
            {
                Console.WriteLine(ae.Message);
            }
        }

        /// <summary>
        /// Walks the bookmark tree starting at <paramref name="bMark"/> (the bookmark
        /// itself, its descendants and its following siblings) and records the
        /// destination page of every bookmark that has a view destination.
        /// </summary>
        /// <param name="bMark">Bookmark to start from; may be null, in which case nothing happens.</param>
        /// <param name="listOfPageNumsToSplit">Receives each distinct destination page number in the order encountered.</param>
        static void EnumerateBookmarks(Bookmark bMark, List<int> listOfPageNumsToSplit)
        {
            // Siblings are walked with a loop and only children are handled by
            // recursion, so the call depth follows the nesting depth of the outline
            // rather than the total number of bookmarks.
            for (Bookmark current = bMark; current != null; current = current.Next)
            {
                Console.WriteLine("Bookmark Title: " + current.Title);
                ViewDestination vDest = current.ViewDestination;

                if (vDest != null)
                {
                    int pageNumber = vDest.PageNumber;
                    Console.WriteLine("Bookmark Destination = page: " + pageNumber);

                    // Multiple bookmarks can point to the same destination page, so skip repeats
                    if (!listOfPageNumsToSplit.Contains(pageNumber))
                    {
                        listOfPageNumsToSplit.Add(pageNumber);
                    }
                }

                EnumerateBookmarks(current.FirstChild, listOfPageNumsToSplit);
            }
        }


        /// <summary>
        /// Adds a split point at page 0 and at every following multiple of
        /// <paramref name="splitPageInterval"/> that is a valid page of the document.
        /// </summary>
        /// <remarks>
        /// PDF page numbers are 0 based (add 1 to get the user sequential page number).
        /// For example: 5 page document, split interval of 2, the split points are
        /// pages 0, 2, 4 (internal PDF page number) a.k.a. pages 1, 3, 5.
        /// </remarks>
        /// <param name="doc">Document whose page count bounds the split points.</param>
        /// <param name="splitPageInterval">Number of pages per output file; values below 1 are treated as 1.</param>
        /// <param name="listOfPageNumsToSplit">Receives the split points in ascending order.</param>
        static void FindPageSets(Document doc, int splitPageInterval, List<int> listOfPageNumsToSplit)
        {
            int nPages = doc.NumPages;

            if (splitPageInterval < 1)
            {
                splitPageInterval = 1;  // prevents an invalid (non-advancing) split interval
            }

            listOfPageNumsToSplit.Add(0);   // Always split on the first page (page 0)

            // Step straight from one multiple of the interval to the next.
            for (int page = splitPageInterval; page < nPages; page += splitPageInterval)
            {
                listOfPageNumsToSplit.Add(page);
            }
        }


        /// <summary>
        /// Searches the text of every page for <paramref name="splitTextString"/>
        /// (ignoring case) and records each page on which it is found.
        /// This function is copied primarily from the TextExtract sample,
        /// but modified to skip writing out the text that it finds.
        /// </summary>
        /// <param name="doc">Document to search.</param>
        /// <param name="splitTextString">Text to look for on each page.</param>
        /// <param name="listOfPageNumsToSplit">Receives the 0-based number of every page containing the text.</param>
        static void FindTextUntagged(Document doc, String splitTextString, List<int> listOfPageNumsToSplit)
        {
            // setup the WordFinderConfig
            WordFinderConfig wordConfig = new WordFinderConfig();
            wordConfig.IgnoreCharGaps = false;
            wordConfig.IgnoreLineGaps = false;
            wordConfig.NoAnnots = false;
            wordConfig.NoEncodingGuess = false;
            // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
            wordConfig.UnknownToStdEnc = false;
            wordConfig.DisableTaggedPDF = false;    // legacy mode WordFinder creation
            wordConfig.NoXYSort = true;
            wordConfig.PreserveSpaces = false;
            wordConfig.NoLigatureExp = false;
            wordConfig.NoHyphenDetection = false;
            wordConfig.TrustNBSpace = false;
            wordConfig.NoExtCharOffset = false;     // text extraction efficiency
            wordConfig.NoStyleInfo = false;         // text extraction efficiency

            using (WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig))
            {
                int nPages = doc.NumPages;

                for (int i = 0; i < nPages; i++)
                {
                    IList<Word> pageWords = wordFinder.GetWordList(i);
                    string pageText;

                    try
                    {
                        pageText = BuildPageText(pageWords);
                    }
                    finally
                    {
                        // Release requested WordList, even if building the text failed
                        for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                        {
                            pageWords[wordnum].Dispose();
                        }
                    }

                    // Compare ignoring case on both sides, so the search string does
                    // not have to be supplied in upper case.
                    if (pageText.IndexOf(splitTextString, StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        Console.WriteLine("Found " + splitTextString + " on page " + i);
                        listOfPageNumsToSplit.Add(i);
                    }
                }
            }
        }

        /// <summary>
        /// Concatenates the words of one page into a single string, inserting a
        /// space after words flagged as adjacent to a space and a line break after
        /// the last word on each line.
        /// </summary>
        /// <remarks>
        /// This reads the entire page word list. If the search string is known to
        /// fall within the first X (e.g. 200) words, the loop bound could be limited
        /// to Math.Min(pageWords.Count, X). To look only within a specific area of
        /// a page (e.g. lower right corner), the bounding box of each Word would
        /// have to be compared with the target area.
        /// </remarks>
        /// <param name="pageWords">Words of one page as returned by the WordFinder.</param>
        /// <returns>The assembled page text.</returns>
        private static string BuildPageText(IList<Word> pageWords)
        {
            StringBuilder pageText = new StringBuilder();

            for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
            {
                Word wInfo = pageWords[wordnum];
                string s = wInfo.Text;

                bool lastWordOnLine =
                    (wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine;
                bool hasSoftHyphen =
                    (wInfo.Attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen;

                // Check for hyphenated words that break across a line.
                if (hasSoftHyphen && lastWordOnLine)
                {
                    // For the purposes of this sample, we'll remove all hyphens.  In practice, you may need to check
                    // words against a dictionary to determine if the hyphenated word is actually one word or two.
                    // Replace copes with text containing no hyphen character at all or more than one.
                    pageText.Append(s.Replace("-", "").Replace("\u00ad", ""));
                }
                else
                {
                    pageText.Append(s);
                }

                // Check for space adjacency and add a space if necessary.
                if ((wInfo.Attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                {
                    pageText.Append(" ");
                }

                // Check for a line break and add one if necessary
                if (lastWordOnLine)
                {
                    pageText.Append("\n");
                }
            }

            return pageText.ToString();
        }

    }
}