Advertisement
Guest User

Untitled

a guest
Sep 12th, 2017
1,515
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 476.50 KB | None | 0 0
  1. <?php
  2. /**************************************************************************************************************
  3.  
  4. NAME
  5. PdfToText.phpclass
  6.  
  7. DESCRIPTION
  8. A class for extracting text from Pdf files.
  9. Usage is very simple : just instantiate a PdfToText object, specifying an input filename, then use the
  10. Text property to retrieve PDF textual contents :
  11.  
  12. $pdf = new PdfToText ( 'sample.pdf' ) ;
  13. echo $pdf -> Text ; // or : echo ( string ) $pdf ;
  14.  
  15. Or :
  16.  
  17. $pdf = new PdfToText ( ) ;
  18. // Modify any property here before loading the file ; for example :
  19. // $pdf -> BlockSeparator = " " ;
  20. $pdf -> Load ( 'sample.pdf' ) ;
  21. echo $pdf -> Text ;
  22.  
  23. AUTHOR
  24. Christian Vigh, 04/2016.
  25.  
  26. HISTORY
  27. [Version : 1.6.7] [Date : 2017/05/31] [Author : CV]
  28. . Added CID fonts
  29. . Changed the way CID font maps are searched and handled
  30.  
  31. (...)
  32.  
  33. [Version : 1.0] [Date : 2016/04/16] [Author : CV]
  34. Initial version.
  35.  
  36. **************************************************************************************************************/
  37.  
  38.  
  39. /*==============================================================================================================
  40.  
  41. class PdfToTextException et al -
  42. Implements an exception thrown when an error is encountered while decoding PDF files.
  43.  
  44. ==============================================================================================================*/
  45.  
  46. // PdfToText exception -
  47. // Base class for all other PdfToText exceptions.
  48. class PdfToTextException extends Exception
  49. {
  50. public static $IsObject = false ;
  51. } ;
  52.  
  53.  
  54. // PdfToTextDecodingException -
  55. // Thrown when unexpected data is encountered while analyzing PDF contents.
  56. class PdfToTextDecodingException extends PdfToTextException
  57. {
  58. public function __construct ( $message, $object_id = false )
  59. {
  60. $text = "Pdf decoding error" ;
  61.  
  62. if ( $object_id !== false )
  63. $text .= " (object #$object_id)" ;
  64.  
  65. $text .= " : $message" ;
  66.  
  67. parent::__construct ( $text ) ;
  68. }
  69. }
  70.  
  71.  
  72. // PdfToTextDecryptionException -
  73. // Thrown when something unexpected is encountered while processing encrypted data.
  74. class PdfToTextDecryptionException extends PdfToTextException
  75. {
  76. public function __construct ( $message, $object_id = false )
  77. {
  78. $text = "Pdf decryption error" ;
  79.  
  80. if ( $object_id !== false )
  81. $text .= " (object #$object_id)" ;
  82.  
  83. $text .= " : $message" ;
  84.  
  85. parent::__construct ( $text ) ;
  86. }
  87. }
  88.  
  89.  
  90. // PdfToTextTimeoutException -
  91. // Thrown when the PDFOPT_ENFORCE_EXECUTION_TIME or PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME option is set, and
  92. // the script took longer than the allowed execution time limit.
  93. class PdfToTextTimeoutException extends PdfToTextException
  94. {
  95. // Set to true if the reason why the max execution time was reached because of too many invocations of the Load() method
  96. // Set to false if the max execution time was reached by simply processing one PDF file
  97. public $GlobalTimeout ;
  98.  
  99. public function __construct ( $message, $global, $php_setting, $class_setting )
  100. {
  101. $text = "PdfToText max execution time reached " ;
  102.  
  103. if ( ! $global )
  104. $text .= "for one single file " ;
  105.  
  106. $text .= "(php limit = {$php_setting}s, class limit = {$class_setting}s) : $message" ;
  107.  
  108. $this -> GlobalTimeout = $global ;
  109.  
  110. parent::__construct ( $text ) ;
  111. }
  112. }
  113.  
  114.  
  115. // PdfToTextFormException -
  116. // Thrown if the xml template passed to the GetFormData() method contains an error.
  117. class PdfToTextFormException extends PdfToTextException
  118. {
  119. public function __construct ( $message )
  120. {
  121. $text = "Pdf form template error" ;
  122.  
  123. $text .= " : $message" ;
  124.  
  125. parent::__construct ( $text ) ;
  126. }
  127. }
  128.  
  129.  
  130. // PdfToTextCaptureException -
  131. // Thrown if the xml template passed to the SetCaptures() method contains an error.
  132. class PdfToTextCaptureException extends PdfToTextException
  133. {
  134. public function __construct ( $message )
  135. {
  136. $text = "Pdf capture template error" ;
  137.  
  138. $text .= " : $message" ;
  139.  
  140. parent::__construct ( $text ) ;
  141. }
  142. }
  143.  
  144.  
  145.  
  146. /*==============================================================================================================
  147.  
  148. Custom error reporting functions.
  149.  
  150. ==============================================================================================================*/
  151. if ( ! function_exists ( 'warning' ) )
  152. {
  153. function warning ( $message )
  154. {
  155. trigger_error ( $message, E_USER_WARNING ) ;
  156. }
  157. }
  158.  
  159.  
  160. if ( ! function_exists ( 'error' ) )
  161. {
  162. function error ( $message )
  163. {
  164. if ( is_string ( $message ) )
  165. trigger_error ( $message, E_USER_ERROR ) ;
  166. else if ( is_a ( $message, '\Exception' ) )
  167. throw $message ;
  168. }
  169. }
  170.  
  171.  
  172. /*==============================================================================================================
  173.  
  174. Backward-compatibility issues.
  175.  
  176. ==============================================================================================================*/
  177.  
  178. // hex2bin -
  179. // This function appeared only in version 5.4.0
  180. if ( ! function_exists ( 'hex2bin' ) )
  181. {
  182. function hex2bin ( $hexstring )
  183. {
  184. $length = strlen ( $hexstring ) ;
  185. $binstring = '' ;
  186. $index = 0 ;
  187.  
  188. while ( $index < $length )
  189. {
  190. $byte = substr ( $hexstring, $index, 2 ) ;
  191. $ch = pack ( 'H*', $byte ) ;
  192. $binstring .= $ch ;
  193.  
  194. $index += 2 ;
  195. }
  196.  
  197. return ( $binstring ) ;
  198. }
  199.  
  200. }
  201.  
  202.  
  203. /*==============================================================================================================
  204.  
  205. class PdfObjectBase -
  206. Base class for all PDF objects defined here.
  207.  
  208. ==============================================================================================================*/
  209. abstract class PdfObjectBase // extends Object
  210. {
  211. // Possible encoding types for streams inside objects ; "unknown" means that the object contains no stream
  212. const PDF_UNKNOWN_ENCODING = 0 ; // No stream decoding type could be identified
  213. const PDF_ASCIIHEX_ENCODING = 1 ; // AsciiHex encoding - not tested
  214. const PDF_ASCII85_ENCODING = 2 ; // Ascii85 encoding - not tested
  215. const PDF_FLATE_ENCODING = 3 ; // Flate/deflate encoding
  216. const PDF_TEXT_ENCODING = 4 ; // Stream data appears in clear text - no decoding required
  217. const PDF_LZW_ENCODING = 5 ; // Not implemented yet
  218. const PDF_RLE_ENCODING = 6 ; // Runtime length encoding ; not implemented yet
  219. const PDF_DCT_ENCODING = 7 ; // JPEG images
  220. const PDF_CCITT_FAX_ENCODING = 8 ; // CCITT Fax encoding - not implemented yet
  221. const PDF_JBIG2_ENCODING = 9 ; // JBIG2 filter encoding (black/white) - not implemented yet
  222. const PDF_JPX_ENCODING = 10 ; // JPEG2000 encoding - not implemented yet
  223.  
  224. // Regular expression used for recognizing references to a font (this list is far from being exhaustive, as it seems
  225. // that you can specify almost everything - however, trying to recognize everything would require to develop a complete
  226. // parser)
  227. protected static $FontSpecifiers = '
  228. (/F \d+ (\.\d+)? ) |
  229. (/R \d+) |
  230. (/f-\d+-\d+) |
  231. (/[CT]\d+_\d+) |
  232. (/TT \d+) |
  233. (/OPBaseFont \d+) |
  234. (/OPSUFont \d+) |
  235. (/[0-9a-zA-Z]) |
  236. (/F\w+) |
  237. (/[A-Za-z][A-Za-z0-9]* ( [\-+] [A-Za-z][A-Za-z0-9]* ))
  238. ' ;
  239.  
  240. // Maps alien Unicode characters such as special spaces, letters with ligatures to their ascii string equivalent
  241. protected static $UnicodeToSimpleAscii = false ;
  242.  
  243.  
  244. /*--------------------------------------------------------------------------------------------------------------
  245.  
  246. Constructor -
  247. Performs static initializations such as the Unicode to Ascii table.
  248.  
  249. *-------------------------------------------------------------------------------------------------------------*/
  250. public function __construct ( )
  251. {
  252. if ( self::$UnicodeToSimpleAscii === false )
  253. {
  254. $charset_file = dirname ( __FILE__ ) . "/Maps/unicode-to-ansi.map" ;
  255. include ( $charset_file ) ;
  256. self::$UnicodeToSimpleAscii = ( isset ( $unicode_to_ansi ) ) ? $unicode_to_ansi : array ( ) ;
  257. }
  258.  
  259. // parent::__construct ( ) ;
  260. }
  261.  
  262.  
  263. /*--------------------------------------------------------------------------------------------------------------
  264.  
  265. NAME
  266. CodePointToUtf8 - Encodes a Unicode codepoint to UTF8.
  267.  
  268. PROTOTYPE
  269. $char = $this -> CodePointToUtf8 ( $code ) ;
  270.  
  271. DESCRIPTION
  272. Encodes a Unicode codepoint to UTF8, trying to handle all possible cases.
  273.  
  274. PARAMETERS
  275. $code (integer) -
  276. Unicode code point to be translated.
  277.  
  278. RETURN VALUE
  279. A string that contains the UTF8 bytes representing the Unicode code point.
  280.  
  281. *-------------------------------------------------------------------------------------------------------------*/
  282. protected function CodePointToUtf8 ( $code )
  283. {
  284. if ( $code )
  285. {
  286. $result = '' ;
  287.  
  288. while ( $code )
  289. {
  290. $word = ( $code & 0xFFFF ) ;
  291.  
  292. if ( ! isset ( self::$UnicodeToSimpleAscii [ $word ] ) )
  293. {
  294. $entity = "&#$word;" ;
  295. $result .= mb_convert_encoding ( $entity, 'UTF-8', 'HTML-ENTITIES' ) . $result ;
  296. }
  297. else
  298. $result .= self::$UnicodeToSimpleAscii [ $word ] ;
  299.  
  300. $code = ( integer ) ( $code / 0xFFFF ) ; // There is no unsigned right-shift operator in PHP...
  301. }
  302.  
  303. return ( $result ) ;
  304. }
  305. // No translation is apparently possible : use a placeholder to signal this situation
  306. else
  307. {
  308. if ( strpos ( PdfToText::$Utf8Placeholder, '%' ) === false )
  309. {
  310. return ( PdfToText::$Utf8Placeholder ) ;
  311. }
  312. else
  313. return ( sprintf ( PdfToText::$Utf8Placeholder, $code ) ) ;
  314. }
  315. }
  316.  
  317.  
  318. /*--------------------------------------------------------------------------------------------------------------
  319.  
  320. DecodeRawName -
  321. Decodes a string that may contain constructs such as '#xy', where 'xy' are hex digits.
  322.  
  323. *-------------------------------------------------------------------------------------------------------------*/
  324. public static function DecodeRawName ( $str )
  325. {
  326. return ( rawurldecode ( str_replace ( '#', '%', $str ) ) ) ;
  327. }
  328.  
  329.  
  330. /*--------------------------------------------------------------------------------------------------------------
  331.  
  332. NAME
  333. GetEncodingType - Gets an object encoding type.
  334.  
  335. PROTOTYPE
  336. $type = $this -> GetEncodingType ( $object_id, $object_data ) ;
  337.  
  338. DESCRIPTION
  339. When an object is a stream, returns its encoding type.
  340.  
  341. PARAMETERS
  342. $object_id (integer) -
  343. PDF object number.
  344.  
  345. $object_data (string) -
  346. Object contents.
  347.  
  348. RETURN VALUE
  349. Returns one of the following values :
  350.  
  351. - PdfToText::PDF_ASCIIHEX_ENCODING :
  352. Hexadecimal encoding of the binary values.
  353. Decoding algorithm was taken from the unknown contributor and not tested so far, since I
  354. couldn't find a PDF file with such an encoding type.
  355.  
  356. - PdfToText::PDF_ASCII85_ENCODING :
  357. Obscure encoding format.
  358. Decoding algorithm was taken from the unknown contributor and not tested so far, since I
  359. couldn't find a PDF file with such an encoding type.
  360.  
  361. - PdfToText::PDF_FLATE_ENCODING :
  362. gzip/deflate encoding.
  363.  
  364. - PdfToText::PDF_TEXT_ENCODING :
  365. Stream data is unencoded (ie, it is pure ascii).
  366.  
  367. - PdfToText::PDF_UNKNOWN_ENCODING :
  368. The object data does not specify any encoding at all. It can happen on objects that do not have
  369. a "stream" part.
  370.  
  371. - PdfToText::PDF_DCT_ENCODING :
  372. a lossy filter based on the JPEG standard.
  373.  
  374. The following constants are defined but not yet implemented ; an exception will be thrown if they are
  375. encountered somewhere in the PDF file :
  376.  
  377. - PDF_LZW_ENCODING :
  378. a filter based on LZW Compression; it can use one of two groups of predictor functions for more
  379. compact LZW compression : Predictor 2 from the TIFF 6.0 specification and predictors (filters)
  380. from the PNG specification
  381.  
  382. - PDF_RLE_ENCODING :
  383. a simple compression method for streams with repetitive data using the run-length encoding
  384. algorithm and the image-specific filters.
  385.  
  386. PDF_CCITT_FAX_ENCODING :
  387. a lossless bi-level (black/white) filter based on the Group 3 or Group 4 CCITT (ITU-T) fax
  388. compression standard defined in ITU-T T.4 and T.6.
  389.  
  390. PDF_JBIG2_ENCODING :
  391. a lossy or lossless bi-level (black/white) filter based on the JBIG2 standard, introduced in
  392. PDF 1.4.
  393.  
  394. PDF_JPX_ENCODING :
  395. a lossy or lossless filter based on the JPEG 2000 standard, introduced in PDF 1.5.
  396.  
  397. *-------------------------------------------------------------------------------------------------------------*/
  398. protected function GetEncodingType ( $object_id, $object_data )
  399. {
  400. $status = preg_match ( '# / (?P<encoding> (ASCIIHexDecode) | (AHx) | (ASCII85Decode) | (A85) | (FlateDecode) | (Fl) | (DCTDecode) | (DCT) | ' .
  401. '(LZWDecode) | (LZW) | (RunLengthDecode) | (RL) | (CCITTFaxDecode) | (CCF) | (JBIG2Decode) | (JPXDecode) ) \b #imsx',
  402. $object_data, $match ) ;
  403.  
  404. if ( ! $status )
  405. return ( self::PDF_TEXT_ENCODING ) ;
  406.  
  407. switch ( strtolower ( $match [ 'encoding' ] ) )
  408. {
  409. case 'asciihexdecode' :
  410. case 'ahx' : return ( self::PDF_ASCIIHEX_ENCODING ) ;
  411.  
  412. case 'ascii85decode' :
  413. case 'a85' : return ( self::PDF_ASCII85_ENCODING ) ;
  414.  
  415. case 'flatedecode' :
  416. case 'fl' : return ( self::PDF_FLATE_ENCODING ) ;
  417.  
  418. case 'dctdecode' :
  419. case 'dct' : return ( self::PDF_DCT_ENCODING ) ;
  420.  
  421. case 'lzwdecode' :
  422. case 'lzw' : return ( self::PDF_LZW_ENCODING ) ;
  423.  
  424. case 'ccittfaxdecode' :
  425. case 'ccf' :
  426.  
  427. case 'runlengthdecode' :
  428. case 'rl' :
  429.  
  430. case 'jbig2decode' :
  431.  
  432. case 'jpxdecode' :
  433. if ( PdfToText::$DEBUG > 1 )
  434. warning ( "Encoding type \"{$match [ 'encoding' ]}\" not yet implemented for pdf object #$object_id." ) ;
  435.  
  436. default : return ( self::PDF_UNKNOWN_ENCODING ) ;
  437. }
  438. }
  439.  
  440.  
  441. /*--------------------------------------------------------------------------------------------------------------
  442.  
  443. NAME
  444. GetObjectReferences - Gets object references from a specified construct.
  445.  
  446. PROTOTYPE
  447. $status = $this -> GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids ) ;
  448.  
  449. DESCRIPTION
  450. Certain parameter specifications are followed by an object reference of the form :
  451. x 0 R
  452. but it can also be an array of references :
  453. [x1 0 R x2 0 R ... xn 0 r]
  454. Those kind of constructs can occur after parameters such as : /Pages, /Contents, /Kids...
  455. This method extracts the object references found in such a construct.
  456.  
  457. PARAMETERS
  458. $object_id (integer) -
  459. Id of the object to be analyzed.
  460.  
  461. $object_data (string) -
  462. Object contents.
  463.  
  464. $searched_string (string) -
  465. String to be searched, that must be followed by an object or an array of object references.
  466. This parameter can contain constructs used in regular expressions. Note however that the '#'
  467. character must be escaped, since it is used as a delimiter in the regex that is applied on
  468. object data.
  469.  
  470. $object_ids (array of integers) -
  471. Returns on output the ids of the pdf object that have been found after the searched string.
  472.  
  473. RETURN VALUE
  474. True if the searched string has been found and is followed by an object or array of object references,
  475. false otherwise.
  476.  
  477. *-------------------------------------------------------------------------------------------------------------*/
  478. protected function GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids )
  479. {
  480. $status = true ;
  481. $object_ids = array ( ) ;
  482.  
  483. if ( preg_match ( "#$searched_string \s* \\[ (?P<objects> [^\]]+ ) \\]#ix", $object_data, $match ) )
  484. {
  485. $object_list = $match [ 'objects' ] ;
  486.  
  487. if ( preg_match_all ( '/(?P<object> \d+) \s+ \d+ \s+ R/x', $object_list, $matches ) )
  488. {
  489. foreach ( $matches [ 'object' ] as $id )
  490. $object_ids [] = ( integer ) $id ;
  491. }
  492. else
  493. $status = false ;
  494. }
  495. else if ( preg_match ( "#$searched_string \s+ (?P<object> \d+) \s+ \d+ \s+ R#ix", $object_data, $match ) )
  496. {
  497. $object_ids [] = ( integer ) $match [ 'object' ] ;
  498. }
  499. else
  500. $status = false ;
  501.  
  502. return ( $status ) ;
  503. }
  504.  
  505.  
  506. /*--------------------------------------------------------------------------------------------------------------
  507.  
  508. NAME
  509. GetStringParameter - Retrieve a string flag value.
  510.  
  511. PROTOTYPE
  512. $result = $this -> GetStringParameter ( $parameter, $object_data ) ;
  513.  
  514. DESCRIPTION
  515. Retrieves the value of a string parameter ; for example :
  516.  
  517. /U (parameter value)
  518.  
  519. or :
  520.  
  521. /U <hexdigits>
  522.  
  523. PARAMETERS
  524. $parameter (string) -
  525. Parameter name.
  526.  
  527. $object_data (string) -
  528. Object containing the parameter.
  529.  
  530. RETURN VALUE
  531. The parameter value.
  532.  
  533. NOTES
  534. description
  535.  
  536. *-------------------------------------------------------------------------------------------------------------*/
  537. protected function GetStringParameter ( $parameter, $object_data )
  538. {
  539. if ( preg_match ( '#' . $parameter . ' \s* \( \s* (?P<value> [^)]+) \)#ix', $object_data, $match ) )
  540. $result = $this -> ProcessEscapedString ( $match [ 'value' ] ) ;
  541. else if ( preg_match ( '#' . $parameter . ' \s* \< \s* (?P<value> [^>]+) \>#ix', $object_data, $match ) )
  542. {
  543. $hexdigits = $match [ 'value' ] ;
  544. $result = '' ;
  545.  
  546. for ( $i = 0, $count = strlen ( $hexdigits ) ; $i < $count ; $i += 2 )
  547. $result .= chr ( hexdec ( substr ( $hexdigits, $i, 2 ) ) ) ;
  548. }
  549. else
  550. $result = '' ;
  551.  
  552. return ( $result ) ;
  553. }
  554.  
  555.  
  556. /*--------------------------------------------------------------------------------------------------------------
  557.  
  558. GetUTCDate -
  559. Reformats an Adobe UTC date to a format that can be understood by the strtotime() function.
  560. Dates are specified in the following format :
  561. D:20150521154000Z
  562. D:20160707182114+02
  563. which are both recognized by strtotime(). However, another format can be specified :
  564. D:20160707182114+02'00'
  565. which is not recognized by strtotime() so we have to get rid of the '00' part.
  566.  
  567. *-------------------------------------------------------------------------------------------------------------*/
  568. protected function GetUTCDate ( $date )
  569. {
  570. if ( $date )
  571. {
  572. if ( ( $date [0] == 'D' || $date [0] == 'd' ) && $date [1] == ':' )
  573. $date = substr ( $date, 2 ) ;
  574.  
  575. if ( ( $index = strpos ( $date, "'" ) ) !== false )
  576. $date = substr ( $date, 0, $index ) ;
  577. }
  578.  
  579. return ( $date ) ;
  580. }
  581.  
  582.  
  583. /*--------------------------------------------------------------------------------------------------------------
  584.  
  585. IsCharacterMap -
  586. Checks if the specified text contents represent a character map definition or not.
  587.  
  588. *-------------------------------------------------------------------------------------------------------------*/
  589. protected function IsCharacterMap ( $decoded_data )
  590. {
  591. // preg_match is faster than calling strpos several times
  592. return ( preg_match ( '#(begincmap)|(beginbfrange)|(beginbfchar)|(/Differences)#ix', $decoded_data ) ) ;
  593. }
  594.  
  595.  
  596. /*--------------------------------------------------------------------------------------------------------------
  597.  
  598. IsFont -
  599. Checks if the current object contents specify a font declaration.
  600.  
  601. *-------------------------------------------------------------------------------------------------------------*/
  602. protected function IsFont ( $object_data )
  603. {
  604. return
  605. (
  606. stripos ( $object_data, '/BaseFont' ) !== false ||
  607. ( ! preg_match ( '#/Type \s* /FontDescriptor#ix', $object_data ) &&
  608. preg_match ( '#/Type \s* /Font#ix', $object_data ) )
  609. ) ;
  610. }
  611.  
  612.  
  613. /*--------------------------------------------------------------------------------------------------------------
  614.  
  615. IsFormData -
  616. Checks if the current object contents specify references to form data.
  617.  
  618. *-------------------------------------------------------------------------------------------------------------*/
  619. protected function IsFormData ( $object_data )
  620. {
  621. return
  622. (
  623. preg_match ( '#\bR \s* \( \s* datasets \s* \)#imsx', $object_data )
  624. ) ;
  625. }
  626.  
  627.  
  628. /*--------------------------------------------------------------------------------------------------------------
  629.  
  630. IsFontMap -
  631. Checks if the code contains things like :
  632. <</F1 26 0 R/F2 22 0 R/F3 18 0 R>>
  633. which maps font 1 (when specified with the /Fx instruction) to object 26, 2 to object 22 and 3 to
  634. object 18, respectively, in the above example.
  635.  
  636. *-------------------------------------------------------------------------------------------------------------*/
  637. protected function IsFontMap ( $object_data )
  638. {
  639. $object_data = self::UnescapeHexCharacters ( $object_data ) ;
  640.  
  641. if ( preg_match ( '#<< \s* ( ' . self::$FontSpecifiers . ' ) \s+ .* >>#imsx', $object_data ) )
  642. return ( true ) ;
  643. else
  644. return ( false ) ;
  645. }
  646.  
  647.  
  648. /*--------------------------------------------------------------------------------------------------------------
  649.  
  650. IsImage -
  651. Checks if the code contains things like :
  652. /Subtype/Image
  653.  
  654. *-------------------------------------------------------------------------------------------------------------*/
  655. protected function IsImage ( $object_data )
  656. {
  657. if ( preg_match ( '#/Subtype \s* /Image#msx', $object_data ) )
  658. return ( true ) ;
  659. else
  660. return ( false ) ;
  661. }
  662.  
  663.  
  664. /*--------------------------------------------------------------------------------------------------------------
  665.  
  666. IsObjectStream -
  667. Checks if the code contains an object stream (/Type/ObjStm)
  668. /Subtype/Image
  669.  
  670. *-------------------------------------------------------------------------------------------------------------*/
  671. protected function IsObjectStream ( $object_data )
  672. {
  673. if ( preg_match ( '#/Type \s* /ObjStm#isx', $object_data ) )
  674. return ( true ) ;
  675. else
  676. return ( false ) ;
  677. }
  678.  
  679.  
  680. /*--------------------------------------------------------------------------------------------------------------
  681.  
  682. NAME
  683. IsPageHeaderOrFooter - Check if the specified stream contents denote page header or footer data.
  684.  
  685. PROTOTYPE
  686. $status = $this -> IsPageHeaderOrFooter ( $stream_data ) ;
  687.  
  688. DESCRIPTION
  689. Checks if the specified decoded stream contents denotes header or footer data.
  690.  
  691. PARAMETERS
  692. $stream_data (string) -
  693. Decoded stream contents.
  694.  
  695. *-------------------------------------------------------------------------------------------------------------*/
  696. protected function IsPageHeaderOrFooter ( $stream_data )
  697. {
  698. if ( preg_match ( '#/Type \s* /Pagination \s* /Subtype \s*/((Header)|(Footer))#ix', $stream_data ) )
  699. return ( true ) ;
  700. else if ( preg_match ( '#/Attached \s* \[ .*? /((Top)|(Bottom)) [^]]#ix', $stream_data ) )
  701. return ( true ) ;
  702. else
  703. return ( false ) ;
  704. }
  705.  
  706.  
  707. /*--------------------------------------------------------------------------------------------------------------
  708.  
  709. NAME
  710. IsText - Check if the specified object contents denote a text stream.
  711.  
  712. PROTOTYPE
  713. $status = $this -> IsText ( $object_data, $decoded_stream_data ) ;
  714.  
  715. DESCRIPTION
  716. Checks if the specified object contents denote a text stream.
  717.  
  718. PARAMETERS
  719. $object_data (string) -
  720. Object data, ie the contents located between the "obj" and "endobj" keywords.
  721.  
  722. $decoded_stream_data (string) -
  723. The flags specified in the object data are not sufficient to be sure that we have a block of
  724. drawing instructions. We must also check for certain common instructions to be present.
  725.  
  726. RETURN VALUE
  727. True if the specified contents MAY be text contents, false otherwise.
  728.  
  729. NOTES
  730. I do not consider this method as bullet-proof. There may arise some cases where non-text blocks can be
  731. mistakenly considered as text blocks, so it is subject to evolve in the future.
  732.  
  733. *-------------------------------------------------------------------------------------------------------------*/
  734. protected function IsText ( $object_data, $decoded_stream_data )
  735. {
  736. if ( preg_match ( '# / (Filter) | (Length) #ix', $object_data ) &&
  737. ! preg_match ( '# / (Type) | (Subtype) | (Length1) #ix', $object_data ) )
  738. {
  739. if ( preg_match ( '/\\b(BT|Tf|Td|TJ|Tj|Tm|Do|cm)\\b/', $decoded_stream_data ) )
  740. return ( true ) ;
  741. }
  742. else if ( preg_match ( '/\\b(BT|Tf|Td|TJ|Tj|Tm|Do|cm)\\b/', $decoded_stream_data ) )
  743. return ( true ) ;
  744.  
  745. return ( false ) ;
  746. }
  747.  
  748.  
  749. /*--------------------------------------------------------------------------------------------------------------
  750.  
  751. NAME
  752. PregStrReplace - Replace string(s) using regular expression(s)
  753.  
  754. PROTOTYPE
  755. $result = PdfToText::PregStrReplace ( $pattern, $replacement, $subject, $limit = -1,
  756. &$match_count = null )
  757.  
  758. DESCRIPTION
  759. This function behaves like a mix of str_replace() and preg_replace() ; it allows to search for strings
  760. using regular expressions, but the replacements are plain-text strings and no reference to a capture
  761. specified in the regular expression will be interpreted.
  762. This is useful when processing templates, which can contain constructs such as "\00" or "$", which are
  763. interpreted by preg_replace() as references to captures.
  764.  
  765. The function has the same parameters as preg_replace().
  766.  
  767. RETURN VALUE
  768. Returns the substituted text.
  769.  
  770. *-------------------------------------------------------------------------------------------------------------*/
  771. public static function PregStrReplace ( $pattern, $replacement, $subject, $limit = -1, &$match_count = null )
  772. {
  773. // Make sure that $pattern and $replacement become arrays of the same size
  774. if ( is_array ( $pattern ) )
  775. {
  776. if ( is_array ( $replacement ) )
  777. {
  778. if ( count ( $pattern ) !== count ( $replacement ) )
  779. {
  780. warning ( "The \$replacement parameter should have the same number of element as \$pattern." ) ;
  781. return ( $subject ) ;
  782. }
  783. }
  784. else
  785. $replacement = array_fill ( $replacement, count ( $pattern ), $replacement ) ;
  786. }
  787. else
  788. {
  789. if ( is_array ( $replacement ) )
  790. {
  791. warning ( "Expected string for the \$replacement parameter." ) ;
  792. return ( $subject ) ;
  793. }
  794.  
  795. $pattern = array ( $pattern ) ;
  796. $replacement = array ( $replacement ) ;
  797. }
  798.  
  799. // Upper limit
  800. if ( $limit < 1 )
  801. $limit = PHP_INT_MAX ;
  802.  
  803. // Loop through each supplied pattern
  804. $current_subject = $subject ;
  805. $count = 0 ;
  806.  
  807. for ( $i = 0, $pattern_count = count ( $pattern ) ; $i < $pattern_count ; $i ++ )
  808. {
  809. $regex = $pattern [$i] ;
  810.  
  811. // Get all matches for this pattern
  812. if ( preg_match_all ( $regex, $current_subject, $matches, PREG_OFFSET_CAPTURE ) )
  813. {
  814. $result = '' ; // Current output result
  815. $last_offset = 0 ;
  816.  
  817. // Process each match
  818. foreach ( $matches [0] as $match )
  819. {
  820. $offset = ( integer ) $match [1] ;
  821.  
  822. // Append data from the last seen offset up to the current one
  823. if ( $last_offset < $offset )
  824. $result .= substr ( $current_subject, $last_offset, $offset - $last_offset ) ;
  825.  
  826. // Append the replacement string for this match
  827. $result .= $replacement [$i] ;
  828.  
  829. // Compute next offset in $current_subject
  830. $last_offset = $offset + strlen ( $match [0] ) ;
  831.  
  832. // Limit checking
  833. $count ++ ;
  834.  
  835. if ( $count > $limit )
  836. break 2 ;
  837. }
  838.  
  839. // Append the last part of the subject that has not been matched by anything
  840. $result .= substr ( $current_subject, $last_offset ) ;
  841.  
  842. // The current subject becomes the string that has been built in the steps above
  843. $current_subject = $result ;
  844. }
  845. }
  846.  
  847. /// All done, return
  848. return ( $current_subject ) ;
  849. }
  850.  
  851.  
  852. /*--------------------------------------------------------------------------------------------------------------
  853.  
  854. NAME
  855. ProcessEscapedCharacter - Interprets a character after a backslash in a string.
  856.  
  857. PROTOTYPE
  858. $ch = $this -> ProcessEscapedCharacter ( $ch ) ;
  859.  
  860. DESCRIPTION
  861. Interprets a character after a backslash in a string and returns the interpreted value.
  862.  
  863. PARAMETERS
  864. $ch (char) -
  865. Character to be escaped.
  866.  
  867. RETURN VALUE
  868. The escaped character.
  869.  
  870. NOTES
  871. This method does not process octal sequences.
  872.  
  873. *-------------------------------------------------------------------------------------------------------------*/
  874. protected function ProcessEscapedCharacter ( $ch )
  875. {
  876. switch ( $ch )
  877. {
  878. // Normally, only a few characters should be escaped...
  879. case '(' : $newchar = "(" ; break ;
  880. case ')' : $newchar = ")" ; break ;
  881. case '[' : $newchar = "[" ; break ;
  882. case ']' : $newchar = "]" ; break ;
  883. case '\\' : $newchar = "\\" ; break ;
  884. case 'n' : $newchar = "\n" ; break ;
  885. case 'r' : $newchar = "\r" ; break ;
  886. case 'f' : $newchar = "\f" ; break ;
  887. case 't' : $newchar = "\t" ; break ;
  888. case 'b' : $newchar = chr ( 8 ) ; break ;
  889. case 'v' : $newchar = chr ( 11 ) ; break ;
  890.  
  891. // ... but should we consider that it is a heresy to escape other characters ?
  892. // For the moment, no.
  893. default : $newchar = $ch ; break ;
  894. }
  895.  
  896. return ( $newchar ) ;
  897. }
  898.  
  899.  
  900. /*--------------------------------------------------------------------------------------------------------------
  901.  
  902. NAME
  903. ProcessEscapedString - Processes a string which can have escaped characters.
  904.  
  905. PROTOTYPE
  906. $result = $this -> ProcessEscapedString ( $str, $process_octal_escapes = false ) ;
  907.  
  908. DESCRIPTION
  909. Processes a string which may contain escape sequences.
  910.  
  911. PARAMETERS
  912. $str (string) -
  913. String to be processed.
  914.  
  915. $process_octal_escapes (boolean) -
  916. When true, octal escape sequences such as \037 are processed.
  917.  
  918. RETURN VALUE
  919. The processed input string.
  920.  
  921. *-------------------------------------------------------------------------------------------------------------*/
  922. protected function ProcessEscapedString ( $str, $process_octal_escapes = false )
  923. {
  924. $length = strlen ( $str ) ;
  925. $offset = 0 ;
  926. $result = '' ;
  927. $ord0 = ord ( '0' ) ;
  928.  
  929. while ( ( $backslash_index = strpos ( $str, '\\', $offset ) ) !== false )
  930. {
  931. if ( $backslash_index + 1 < $length )
  932. {
  933. $ch = $str [ ++ $backslash_index ] ;
  934.  
  935. if ( ! $process_octal_escapes )
  936. {
  937. $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) . $this -> ProcessEscapedCharacter ( $ch ) ;
  938. $offset = $backslash_index + 1 ;
  939. }
  940. else if ( $ch < '0' || $ch > '7' )
  941. {
  942. $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) . $this -> ProcessEscapedCharacter ( $ch ) ;
  943. $offset = $backslash_index + 1 ;
  944. }
  945. else
  946. {
  947. $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) ;
  948. $ord = ord ( $ch ) - $ord0 ;
  949. $count = 0 ;
  950. $backslash_index ++ ;
  951.  
  952. while ( $backslash_index < $length && $count < 2 &&
  953. $str [ $backslash_index ] >= '0' && $str [ $backslash_index ] <= '7' )
  954. {
  955. $ord = ( $ord * 8 ) + ( ord ( $str [ $backslash_index ++ ] ) - $ord0 ) ;
  956. $count ++ ;
  957. }
  958.  
  959. $result .= chr ( $ord ) ;
  960. $offset = $backslash_index ;
  961. }
  962. }
  963. else
  964. break ;
  965. }
  966.  
  967. $result .= substr ( $str, $offset ) ;
  968.  
  969. return ( $result ) ;
  970. }
  971.  
  972.  
  973. /*--------------------------------------------------------------------------------------------------------------
  974.  
  975. NAME
  976. Unescape - Processes escape sequences from the specified string.
  977.  
  978. PROTOTYPE
  979. $value = $this -> Unescape ( $text ) ;
  980.  
  981. DESCRIPTION
  982. Processes escape sequences within the specified text. The recognized escape sequences are like the
  983. C-language ones : \b (backspace), \f (form feed), \r (carriage return), \n (newline), \t (tab).
  984. Octal sequences of one to three digits (such as \037) are converted to the corresponding character ; any other character prefixed by "\" is returned as is.
  985.  
  986. PARAMETERS
  987. $text (string) -
  988. Text to be unescaped.
  989.  
  990. RETURN VALUE
  991. Returns the unescaped value of $text.
  992.  
  993. *-------------------------------------------------------------------------------------------------------------*/
  994. public static function Unescape ( $text )
  995. {
  996. $length = strlen ( $text ) ;
  997. $result = '' ;
  998. $ord0 = ord ( 0 ) ;
  999.  
  1000. for ( $i = 0 ; $i < $length ; $i ++ )
  1001. {
  1002. $ch = $text [$i] ;
  1003.  
  1004. if ( $ch == '\\' && isset ( $text [$i+1] ) )
  1005. {
  1006. $nch = $text [++$i] ;
  1007.  
  1008. switch ( $nch )
  1009. {
  1010. case 'b' : $result .= "\b" ; break ;
  1011. case 't' : $result .= "\t" ; break ;
  1012. case 'f' : $result .= "\f" ; break ;
  1013. case 'r' : $result .= "\r" ; break ;
  1014. case 'n' : $result .= "\n" ; break ;
  1015. default :
  1016. // Octal escape notation
  1017. if ( $nch >= '0' && $nch <= '7' )
  1018. {
  1019. $ord = ord ( $nch ) - $ord0 ;
  1020. $digits = 1 ;
  1021. $i ++ ;
  1022.  
  1023. while ( $i < $length && $digits < 3 && $text [$i] >= '0' && $text [$i] <= '7' )
  1024. {
  1025. $ord = ( $ord * 8 ) + ord ( $text [$i] ) - $ord0 ;
  1026. $i ++ ;
  1027. $digits ++ ;
  1028. }
  1029.  
  1030. $i -- ; // Count one character less since $i will be incremented at the end of the for() loop
  1031.  
  1032. $result .= chr ( $ord ) ;
  1033. }
  1034. else
  1035. $result .= $nch ;
  1036. }
  1037. }
  1038. else
  1039. $result .= $ch ;
  1040. }
  1041.  
  1042. return ( $result ) ;
  1043. }
  1044.  
  1045.  
  1046. /*--------------------------------------------------------------------------------------------------------------
  1047.  
  1048. NAME
  1049. UnescapeHexCharacters - Unescapes characters in the #xy notation.
  1050.  
  1051. PROTOTYPE
  1052. $result = $this -> UnescapeHexCharacters ( $data ) ;
  1053.  
  1054. DESCRIPTION
  1055. Some specifications contain hex characters specified as #xy. For the moment, I have met such a construct in
  1056. font aliases such as :
  1057. /C2#5F0 25 0 R
  1058. where "#5F" stands for "_", giving :
  1059. /C2_0 25 0 R
  1060. Hope that such constructs do not happen in other places...
  1061.  
  1062. PARAMETERS
  1063. $data (string) -
  1064. String to be unescaped.
  1065.  
  1066. RETURN VALUE
  1067. The input string with all the hex character representations replaced with their ascii equivalent.
  1068.  
  1069. *-------------------------------------------------------------------------------------------------------------*/
  1070. public static function UnescapeHexCharacters ( $data )
  1071. {
  1072. if ( strpos ( $data, 'stream' ) === false && preg_match ( '/(?P<hex> \# [0-9a-f] [0-9a-f])/ix', $data ) )
  1073. {
  1074. preg_match_all ( '/(?P<hex> \# [0-9a-f] [0-9a-f])/ix', $data, $matches ) ;
  1075.  
  1076. $searches = array ( ) ;
  1077. $replacements = array ( ) ;
  1078.  
  1079. foreach ( $matches [ 'hex' ] as $hex )
  1080. {
  1081. if ( ! isset ( $searches [ $hex ] ) )
  1082. {
  1083. $searches [ $hex ] = $hex ;
  1084. $replacements [] = chr ( hexdec ( substr ( $hex, 1 ) ) ) ;
  1085. }
  1086.  
  1087. $data = str_replace ( $searches, $replacements, $data ) ;
  1088. }
  1089. }
  1090.  
  1091. return ( $data ) ;
  1092. }
  1093.  
  1094.  
  1095. /*--------------------------------------------------------------------------------------------------------------
  1096.  
  1097. ValidatePhpName -
  1098. Checks that the specified name (declared in the XML template) is a valid PHP name.
  1099.  
  1100. *-------------------------------------------------------------------------------------------------------------*/
  1101. public static function ValidatePhpName ( $name )
  1102. {
  1103. $name = trim ( $name ) ;
  1104.  
  1105. if ( ! preg_match ( '/^ [a-z_][a-z0-9_]* $/ix', $name ) )
  1106. error ( new PdfToTextFormException ( "Invalid PHP name \"$name\"." ) ) ;
  1107.  
  1108. return ( $name ) ;
  1109. }
  1110. }
  1111.  
  1112.  
  1113. /*==============================================================================================================
  1114.  
  1115. PdfToText class -
  1116. A class for extracting text from Pdf files.
  1117.  
  1118. ==============================================================================================================*/
  1119. class PdfToText extends PdfObjectBase
  1120. {
  1121. // Current version of the class
  1122. const VERSION = "1.6.7" ;
  1123.  
  1124. // Pdf processing options
  1125. const PDFOPT_NONE = 0x00000000 ; // No extra option
  1126. const PDFOPT_REPEAT_SEPARATOR = 0x00000001 ; // Repeats the Separator property if the offset between two text blocks (in array notation)
  1127. // is greater than $this -> MinSpaceWidth
  1128. const PDFOPT_GET_IMAGE_DATA = 0x00000002 ; // Retrieve raw image data in the $this -> ImageData array
  1129. const PDFOPT_DECODE_IMAGE_DATA = 0x00000004 ; // Creates a jpeg resource for each image
  1130. const PDFOPT_IGNORE_TEXT_LEADING = 0x00000008 ; // Ignore text leading values
  1131. const PDFOPT_NO_HYPHENATED_WORDS = 0x00000010 ; // Join hyphenated words that are split on two lines
  1132. const PDFOPT_AUTOSAVE_IMAGES = 0x00000020 ; // Autosave images ; the ImageFileTemplate property will need to be defined
  1133. const PDFOPT_ENFORCE_EXECUTION_TIME = 0x00000040 ; // Enforces the max_execution_time PHP setting when processing a file. A PdfTexterTimeoutException
  1134. // will be thrown if processing of a single file reaches (time_limit - 1 second) by default
  1135. // The MaxExecutionTime property can be set to modify this default value.
  1136. const PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME = 0x00000080 ; // Same as PDFOPT_ENFORCE_EXECUTION_TIME, but for all calls to the Load() method of the PdfToText class
  1137. // The MaxGlobalExecutionTime static property can be set to modify the default time limit
  1138. const PDFOPT_IGNORE_HEADERS_AND_FOOTERS = 0x00000300 ; // Ignore headers and footers
  1139.  
  1140. const PDFOPT_RAW_LAYOUT = 0x00000000 ; // Layout rendering : raw (default)
  1141. const PDFOPT_BASIC_LAYOUT = 0x00000400 ; // Layout rendering : basic
  1142.  
  1143. const PDFOPT_LAYOUT_MASK = 0x00000C00 ; // Mask to isolate the targeted layout
  1144.  
  1145. const PDFOPT_ENHANCED_STATISTICS = 0x00001000 ; // Compute statistics on PDF language instructions
  1146. const PDFOPT_DEBUG_SHOW_COORDINATES = 0x00002000 ; // Include text coordinates ; implies the PDFOPT_BASIC_LAYOUT option
  1147. // This option can be useful if you want to use capture areas and get information about
  1148. // their coordinates
  1149. const PDFOPT_CAPTURE = 0x00004000 ; // Indicates that the caller wants to capture some text and use the SetCaptures() method
  1150. // It currently enables the PDFOPT_BASIC_LAYOUT option
  1151. const PDFOPT_LOOSE_X_CAPTURE = 0x00008000 ; // Includes in captures text fragments whose dimensions may exceed the captured area dimensions
  1152. const PDFOPT_LOOSE_Y_CAPTURE = 0x00010000 ; // (currently not used)
  1153.  
  1154. // When boolean true, outputs debug information about fonts, character maps and drawing contents.
  1155. // When integer > 1, outputs additional information about other objects.
  1156. public static $DEBUG = false ;
  1157.  
  1158. // Current filename
  1159. public $Filename = false ;
  1160. // Extracted text
  1161. public $Text = '' ;
  1162. // Document pages (array of strings)
  1163. public $Pages = array ( ) ;
  1164. // Document images (array of PdfImage objects)
  1165. public $Images = array ( ) ;
  1166. protected $ImageCount = 0 ;
  1167. // Raw data for document images
  1168. public $ImageData = array ( ) ;
  1169. // ImageAutoSaveFileTemplate :
  1170. // Template for the file names to be generated when extracting images, if the PDFOPT_AUTOSAVE_IMAGES has been specified.
  1171. // Can contain any path, plus the following printf()-like modifiers :
  1172. // . "%p" : Path of the original PDF file.
  1173. // . "%f" : Filename part of the original PDF file.
  1174. // . "%d" : A sequential number, starting from 1, used when generating filenames. The format can contain a width specifier,
  1175. // such as "%3d", which will generate 3-digits sequential numbers left-filled with zeroes.
  1176. // . "%s" : Image suffix, which will automatically be based on the underlying image type.
  1177. public $ImageAutoSaveFileTemplate = "%p/%f.%d.%s" ;
  1178. // Auto-save image file format
  1179. public $ImageAutoSaveFormat = IMG_JPEG ;
  1180. // Auto-saved image file names
  1181. public $AutoSavedImageFiles = array ( ) ;
  1182. // Text chunk separator (used to separate blocks of text specified as an array notation)
  1183. public $BlockSeparator = '' ;
  1184. // Separator used to separate text groups where the offset value is less than -1000 thousands of character units
  1185. // (eg : [(1)-1822(2)] will add a separator between the characters "1" and "2")
  1186. // Note that such values are expressed in thousands of text units and subtracted from the current position. A
  1187. // negative value means adding more space between the two text units it separates.
  1188. public $Separator = ' ' ;
  1189. // Separator to be used between pages in the $Text property
  1190. public $PageSeparator = "\n" ;
  1191. // Minimum value (in 1/1000 of text units) that separates two text chunks that can be considered as a real space
  1192. public $MinSpaceWidth = 200 ;
  1193. // Pdf options
  1194. public $Options = self::PDFOPT_NONE ;
  1195. // Maximum number of pages to extract from the PDF. A zero value means "extract everything"
  1196. // If this number is negative, then the pages to be extract start from the last page. For example, a value of -2
  1197. // extracts the last two pages
  1198. public $MaxSelectedPages = false ;
  1199. // Maximum number of images to be extracted. A value of zero means "extract everything". A non-zero value gives
  1200. // the number of images to extract.
  1201. public $MaxExtractedImages = false ;
  1202. // Location of the CID tables directory
  1203. public static $CIDTablesDirectory ;
  1204. // Location of the Font metrics directory, for the Adobe standard 14 fonts
  1205. public static $FontMetricsDirectory ;
  1206. // Standard Adobe font names, and their corresponding file in $FontMetricsDirectory
  1207. public static $AdobeStandardFontMetrics = array
  1208. (
  1209. 'courier' => 'courier.fm',
  1210. 'courier-bold' => 'courierb.fm',
  1211. 'courier-oblique' => 'courieri.fm',
  1212. 'courier-boldoblique' => 'courierbi.fm',
  1213. 'helvetica' => 'helvetica.fm',
  1214. 'helvetica-bold' => 'helveticab.fm',
  1215. 'helvetica-oblique' => 'helveticai.fm',
  1216. 'helvetica-boldoblique' => 'helveticabi.fm',
  1217. 'symbol' => 'symbol.fm',
  1218. 'times-roman' => 'times.fm',
  1219. 'times-bold' => 'timesb.fm',
  1220. 'times-bolditalic' => 'timesbi.fm',
  1221. 'times-italic' => 'timesi.fm',
  1222. 'zapfdingbats' => 'zapfdingbats.fm'
  1223. ) ;
  1224. // Author information
  1225. public $Author = '' ;
  1226. public $CreatorApplication = '' ;
  1227. public $ProducerApplication = '' ;
  1228. public $CreationDate = '' ;
  1229. public $ModificationDate = '' ;
  1230. public $Title = '' ;
  1231. public $Subject = '' ;
  1232. public $Keywords = '' ;
  1233. protected $GotAuthorInformation = false ;
  1234. // Unique and arbitrary file identifier, as specified in the PDF file
  1235. // Well, in fact, there are two IDs, but the PDF specification does not mention the goal of the second one
  1236. public $ID = '' ;
  1237. public $ID2 = '' ;
  1238. // End of line string
  1239. public $EOL = PHP_EOL ;
  1240. // String to be used when no Unicode translation is possible
  1241. public static $Utf8Placeholder = '' ;
  1242. // Information about memory consumption implied by the file currently being loaded
  1243. public $MemoryUsage,
  1244. $MemoryPeakUsage ;
  1245. // Offset of the document start (%PDF-x.y)
  1246. public $DocumentStartOffset ;
  1247. // Debug statistics
  1248. public $Statistics = array ( ) ;
  1249. // Max execution time settings. A positive value means "don't exceed that number of seconds".
  1250. // A negative value means "Don't exceed PHP setting max_execution_time - that number of seconds". If the result
  1251. // is negative, then the default will be "max_execution_time - 1".
  1252. // For those limits to be enforced, you need to specify either the PDFOPT_ENFORCE_EXECUTION_TIME or
  1253. // PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME options, or both
  1254. public $MaxExecutionTime = -1 ;
  1255. public static $MaxGlobalExecutionTime = -1 ;
  1256. // This property is expressed in percents ; it gives the extra percentage to add to the values computed by
  1257. // the PdfTexterFont::GetStringWidth() method.
  1258. // This is basically used when computing text positions and string lengths with the PDFOPT_BASIC_LAYOUT option :
  1259. // the computed string length is shorter than its actual length (because of extra spacing determined by character
  1260. // kerning in the font data). To determine whether two consecutive blocks of text should be separated by a space,
  1261. // we empirically add this extra percentage to the computed string length. The default is -5%.
  1262. public $ExtraTextWidth = -5 ;
  1263.  
  1264. // Marker stuff. The unprocessed marker list is a sequential array of markers, which will later be dispatched into
  1265. // indexed arrays during their first reference
  1266. protected $UnprocessedMarkerList = array ( 'font' => array ( ) ) ;
  1267. protected $TextWithFontMarkers = array ( ) ;
  1268.  
  1269. // Internal variables used when the PDFOPT_ENFORCE_* options are specified
  1270. protected static $PhpMaxExecutionTime ;
  1271. protected static $GlobalExecutionStartTime ;
  1272. protected static $AllowedGlobalExecutionTime ;
  1273. protected $ExecutionStartTime ;
  1274. protected $AllowedExecutionTime ;
  1275.  
  1276. // Font mappings
  1277. protected $FontTable = false ;
  1278. // Extra Adobe standard font mappings (for character names of the form "/axxx" for example)
  1279. protected $AdobeExtraMappings = array ( ) ;
  1280. // Page map object
  1281. protected $PageMap ;
  1282. // Page locations (start and end offsets)
  1283. protected $PageLocations ;
  1284. // Encryption data
  1285. public $IsEncrypted = false ;
  1286. protected $EncryptionData = false ;
  1287. // A flag coming from the constructor options, telling if enhanced statistics are enabled
  1288. protected $EnhancedStatistics ;
  1289.  
  1290. // Document text fragments, with their absolute (x,y) position, approximate width and height
  1291. protected $DocumentFragments ;
  1292.  
  1293. // Form data
  1294. protected $FormData ;
  1295. protected $FormDataObjectNumbers ;
  1296. protected $FormDataDefinitions ;
  1297. protected $FormaDataObjects ;
  1298.  
  1299. // Capture data
  1300. public $CaptureDefinitions ;
  1301. protected $CaptureObject ;
  1302.  
  1303. // Indicates whether global static initializations have been made
  1304. // This is mainly used for variables such as $Utf8Placeholder, which is initialized to a different value when debug mode is enabled
  1305. private static $StaticInitialized = false ;
  1306.  
  1307. // Drawing instructions that are to be ignored and removed from a text stream before processing, for performance
  1308. // reasons (it is faster to call preg_replace() once to remove them than calling the __next_instruction() and
  1309. // __next_token() methods to process an input stream containing such useless instructions)
  1310. // This is an array of regular expressions where the following constructs are replaced at runtime during static
  1311. // initialization :
  1312. // %n - Will be replaced with a regex matching a decimal number.
  1313. private static $IgnoredInstructionTemplatesLayout = array
  1314. (
  1315. '%n{6} ( (c) ) \s+',
  1316. '%n{4} ( (re) | (y) | (v) | (k) | (K) ) \s+',
  1317. '%n{3} ( (scn) | (SCN) | (r) | (rg) | (RG) | (sc) | (SC) ) \s+',
  1318. '%n{2} ( (m) | (l) ) \s+',
  1319. '%n ( (w) | (M) | (g) | (G) | (J) | (j) | (d) | (i) | (sc) | (SC) | (Tc) | (Tw) | (scn) | (Tr) | (Tz) | (Ts) ) \s+',
  1320. '\b ( (BDC) | (EMC) ) \s+',
  1321. '\/( (Cs \d+) | (CS \d+) | (G[Ss] \d+) | (Fm \d+) | (Im \d+) | (PlacedGraphic) ) \s+ \w+ \s*',
  1322. '\/( (Span) | (Artifact) | (Figure) | (P) ) \s* << .*? >> [ \t\r\n>]*',
  1323. '\/ ( (PlacedGraphic) | (Artifact) ) \s+',
  1324. '\d+ \s+ ( (scn) | (SCN) )',
  1325. '\/MC \d+ \s+',
  1326. '^ \s* [fhS] \r? \n',
  1327. '^W \s+ n \r? \n',
  1328. '(f | W) \* \s+',
  1329. '^[fhnS] \s+',
  1330. '-?0 (\. \d+)? \s+ T[cw]',
  1331. '\bBI \s+ .*? \bID \s+ .*? \bEI',
  1332. '\/ \w+ \s+ ( (cs) | (CS) | (ri) | (gs) )',
  1333. // Hazardous replaces ?
  1334. '( [Ww] \s+ ){3,}',
  1335. ' \[\] \s+ [Shs] \s+'
  1336. ) ;
  1337. // Additional instructions to be stripped when no particular page layout has been requested
  1338. private static $IgnoredInstructionTemplatesNoLayout = array
  1339. (
  1340. '%n{6} ( (cm) ) \s+',
  1341. // '\b ( (BT) | (ET) ) \s+',
  1342. '^ \s* [Qq] \r? \n',
  1343. '^ \s* (\b [a-zA-Z] \s+)+',
  1344. '\s* (\b [a-zA-Z] \s+)+$',
  1345. '^[qQ] \s+',
  1346. '^q \s+ [hfS] \n',
  1347. '( [Qfhnq] \s+ ){2,}'
  1348. ) ;
  1349. // Replacement regular expressions for %something constructs specified in the $IgnoredInstructions array
  1350. private static $ReplacementConstructs = array
  1351. (
  1352. '%n' => '( [+\-]? ( ( [0-9]+ ( \. [0-9]* )? ) | ( \. [0-9]+ ) ) \s+ )'
  1353. ) ;
  1354. // The final regexes that are built during static initialization by the __build_ignored_instructions() method
  1355. private static $IgnoredInstructionsNoLayout = array ( ) ;
  1356. private static $IgnoredInstructionsLayout = array ( ) ;
  1357. private $IgnoredInstructions = array ( ) ;
  1358.  
  1359. // Map id buffer - for avoiding unnecessary calls to GetFontByMapId
  1360. private $MapIdBuffer = array ( ) ;
  1361.  
  1362. // Same for MapCharacter()
  1363. private $CharacterMapBuffer = array ( ) ;
  1364.  
  1365. // Font objects buffer - used by __assemble_text_fragments()
  1366. private $FontObjectsBuffer = array ( ) ;
  1367.  
  1368. // Regex used for removing hyphens - we have to take care of different line endings : "\n" for Unix, "\r\n"
  1369. // for Windows, and "\r" for pure Mac files.
  1370. // Note that we replace an hyphen followed by an end-of-line then by non-space characters with the non-space
  1371. // characters, so the word gets joined on the same line. Spaces after the end of the word (on the next line)
  1372. // are removed, in order for the next word to appear at the beginning of the second line.
  1373. private static $RemoveHyphensRegex = '#
  1374. (
  1375. -
  1376. [ \t]* ( (\r\n) | \n | \r )+ [ \t\r\n]*
  1377. )
  1378. ([^ \t\r\n]+)
  1379. \s*
  1380. #msx' ;
  1381.  
  1382. // A small list of Unicode character ranges that are related to languages written from right to left
  1383. // For performance reasons, everything is mapped to a range here, even if it includes codepoints that do not map to anything
  1384. // (this class is not a Unicode codepoint validator, but a Pdf text extractor...)
  1385. // The UTF-16 version is given as comments ; only the UTF-8 translation is used here
  1386. // To be completed !
  1387. private static $RtlCharacters = array
  1388. (
  1389. // This range represents the following languages :
  1390. // - Hebrew (0590..05FF)
  1391. // - Arabic (0600..06FF)
  1392. // - Syriac (0700..074F)
  1393. // - Supplement for Arabic (0750..077F)
  1394. // - Thaana (0780..07BF)
  1395. // - N'ko (07C0..07FF)
  1396. // - Samaritan (0800..083F)
  1397. // - Mandaic (0840..085F)
  1398. // array ( 0x00590, 0x0085F ),
  1399. // Hebrew supplement (I suppose ?) + other characters
  1400. // array ( 0x0FB1D, 0x0FEFC ),
  1401. // Mende kikakui
  1402. // array ( 0x1E800, 0x1E8DF ),
  1403. // Adlam
  1404. // array ( 0x1E900, 0x1E95F ),
  1405. // Others
  1406. // array ( 0x10800, 0x10C48 ),
  1407. // array ( 0x1EE00, 0x1EEBB )
  1408. "\xD6" => array ( array ( "\x90", "\xBF" ) ),
  1409. "\xD7" => array ( array ( "\x80", "\xBF" ) ),
  1410. "\xD8" => array ( array ( "\x80", "\xBF" ) ),
  1411. "\xD9" => array ( array ( "\x80", "\xBF" ) ),
  1412. "\xDA" => array ( array ( "\x80", "\xBF" ) ),
  1413. "\xDB" => array ( array ( "\x80", "\xBF" ) ),
  1414. "\xDC" => array ( array ( "\x80", "\xBF" ) ),
  1415. "\xDD" => array ( array ( "\x80", "\xBF" ) ),
  1416. "\xDE" => array ( array ( "\x80", "\xBF" ) ),
  1417. "\xDF" => array ( array ( "\x80", "\xBF" ) )
  1418. /*
  1419. "\xE0" => array
  1420. (
  1421. array ( "\xA0\x80", "\xA0\xBF" ),
  1422. array ( "\xA1\x80", "\xA1\x9F" )
  1423. ),
  1424. "\xEF" => array
  1425. (
  1426. array ( "\xAC\x9D", "\xAC\xBF" ),
  1427. array ( "\xAD\x80", "\xAD\xBF" ),
  1428. array ( "\xAE\x80", "\xAE\xBF" ),
  1429. array ( "\xAF\x80", "\xAF\xBF" ),
  1430. array ( "\xB0\x80", "\xB0\xBF" ),
  1431. array ( "\xB1\x80", "\xB1\xBF" ),
  1432. array ( "\xB2\x80", "\xB2\xBF" ),
  1433. array ( "\xB3\x80", "\xB3\xBF" ),
  1434. array ( "\xB4\x80", "\xB4\xBF" ),
  1435. array ( "\xB5\x80", "\xB5\xBF" ),
  1436. array ( "\xB6\x80", "\xB6\xBF" ),
  1437. array ( "\xB7\x80", "\xB7\xBF" ),
  1438. array ( "\xB8\x80", "\xB8\xBF" ),
  1439. array ( "\xB9\x80", "\xB9\xBF" ),
  1440. array ( "\xBA\x80", "\xBA\xBF" ),
  1441. array ( "\xBB\x80", "\xBB\xBC" )
  1442. )
  1443. */
  1444. ) ;
  1445.  
  1446. // UTF-8 prefixes for RTL characters as keys, and number of characters that must follow the prefix as values
  1447. private static $RtlCharacterPrefixLengths = array
  1448. (
  1449. "\xD6" => 1,
  1450. "\xD7" => 1,
  1451. "\xD8" => 1,
  1452. "\xD9" => 1,
  1453. "\xDA" => 1,
  1454. "\xDB" => 1,
  1455. "\xDC" => 1,
  1456. "\xDE" => 1,
  1457. "\xDF" => 1
  1458. /*
  1459. "\xE0" => 2,
  1460. "\xEF" => 2
  1461. */
  1462. ) ;
  1463.  
  1464. // A string that contains all the RTL character prefixes above
  1465. private static $RtlCharacterPrefixes ;
  1466.  
  1467. // As usual, caching a little bit the results of the IsRtlCharacter() method is welcome. Each item will have the value true if the
  1468. // character is RTL, or false if LTR.
  1469. private $RtlCharacterBuffer = array ( ) ;
  1470.  
  1471. // A subset of a character classification array that avoids too many calls to the ctype_* functions or too many
  1472. // character comparisons.
  1473. // This array is used only for highly solicited parts of code
  1474. const CTYPE_ALPHA = 0x01 ; // Letter
  1475. const CTYPE_DIGIT = 0x02 ; // Digit
  1476. const CTYPE_XDIGIT = 0x04 ; // Hex digit
  1477. const CTYPE_ALNUM = 0x08 ; // Letter or digit
  1478. const CTYPE_LOWER = 0x10 ; // Lower- or upper-case letters
  1479. const CTYPE_UPPER = 0x20 ;
  1480.  
  1481. private static $CharacterClasses = false ;
  1482.  
  1483. // Stuff specific to the current PHP version
  1484. private static $HasMemoryGetUsage ;
  1485. private static $HasMemoryGetPeakUsage ;
  1486.  
  1487.  
  1488. /*--------------------------------------------------------------------------------------------------------------
  1489.  
  1490. CONSTRUCTOR
  1491. $pdf = new PdfToText ( $filename = null, $options = PDFOPT_NONE, $user_password = false, $owner_password = false ) ;
  1492.  
  1493. DESCRIPTION
  1494. Builds a PdfToText object and optionally loads the specified file's contents.
  1495.  
  1496. PARAMETERS
  1497. $filename (string) -
  1498. Optional PDF filename whose text contents are to be extracted.
  1499.  
  1500. $options (integer) -
  1501. A combination of PDFOPT_* flags. This can be any of the following :
  1502.  
  1503. - PDFOPT_REPEAT_SEPARATOR :
  1504. Text constructs specified as an array are separated by an offset which is expressed as
  1505. thousands of text units ; for example :
  1506.  
  1507. [(1)-2000(2)]
  1508.  
  1509. will be rendered as the text "1  2" ("1" and "2" being separated by two spaces) if the
  1510. "Separator" property is set to a space (the default) and this flag is specified.
  1511. When not specified, the text will be rendered as "1 2".
  1512.  
  1513. - PDFOPT_NONE :
  1514. None of the above options will apply.
  1515.  
  1516. *-------------------------------------------------------------------------------------------------------------*/
  1517. public function __construct ( $filename = null, $options = self::PDFOPT_NONE, $user_password = false, $owner_password = false )
  1518. {
  1519. // We need the mbstring PHP extension here...
  1520. if ( ! function_exists ( 'mb_convert_encoding' ) )
  1521. error ( "You must enable the mbstring PHP extension to use this class." ) ;
  1522.  
  1523. // Perform static initializations if needed
  1524. if ( ! self::$StaticInitialized )
  1525. {
  1526. if ( self::$DEBUG )
  1527. {
  1528. // In debug mode, initialize the utf8 placeholder only if it still set to its default value, the empty string
  1529. if ( self::$Utf8Placeholder == '' )
  1530. self::$Utf8Placeholder = '[Unknown character : 0x%08X]' ;
  1531. }
  1532.  
  1533. // Build the list of regular expressions from the list of ignored instruction templates
  1534. self::__build_ignored_instructions ( ) ;
  1535.  
  1536. // Check if some functions are supported or not
  1537. self::$HasMemoryGetUsage = function_exists ( 'memory_get_usage' ) ;
  1538. self::$HasMemoryGetPeakUsage = function_exists ( 'memory_get_peak_usage' ) ;
  1539.  
  1540. // Location of the directory containing CID fonts
  1541. self::$CIDTablesDirectory = dirname ( __FILE__ ) . DIRECTORY_SEPARATOR . 'CIDTables' ;
  1542. self::$FontMetricsDirectory = dirname ( __FILE__ ) . DIRECTORY_SEPARATOR . 'FontMetrics' ;
  1543.  
  1544. // The string that contains all the Rtl character prefixes in UTF-8 - An optimization used by the __rtl_process() method
  1545. self::$RtlCharacterPrefixes = implode ( '', array_keys ( self::$RtlCharacterPrefixLengths ) ) ;
  1546.  
  1547. // Build the character classes (used only for testing letters and digits)
  1548. if ( self::$CharacterClasses === false )
  1549. {
  1550. for ( $ord = 0 ; $ord < 256 ; $ord ++ )
  1551. {
  1552. $ch = chr ( $ord ) ;
  1553.  
  1554. if ( $ch >= '0' && $ch <= '9' )
  1555. self::$CharacterClasses [ $ch ] = self::CTYPE_DIGIT | self::CTYPE_XDIGIT | self::CTYPE_ALNUM ;
  1556. else if ( $ch >= 'A' && $ch <= 'Z' )
  1557. {
  1558. self::$CharacterClasses [ $ch ] = self::CTYPE_ALPHA | self::CTYPE_UPPER | self::CTYPE_ALNUM ;
  1559.  
  1560. if ( $ch <= 'F' )
  1561. self::$CharacterClasses [ $ch ] |= self::CTYPE_XDIGIT ;
  1562. }
  1563. else if ( $ch >= 'a' && $ch <= 'z' )
  1564. {
  1565. self::$CharacterClasses [ $ch ] = self::CTYPE_ALPHA | self::CTYPE_LOWER | self::CTYPE_ALNUM ;
  1566.  
  1567. if ( $ch <= 'f' )
  1568. self::$CharacterClasses [ $ch ] |= self::CTYPE_XDIGIT ;
  1569. }
  1570. else
  1571. self::$CharacterClasses [ $ch ] = 0 ;
  1572. }
  1573. }
  1574.  
  1575. // Global execution time limit
  1576. self::$PhpMaxExecutionTime = ( integer ) ini_get ( 'max_execution_time' ) ;
  1577.  
  1578. if ( ! self::$PhpMaxExecutionTime ) // Paranoia : default max script execution time to 120 seconds
  1579. self::$PhpMaxExecutionTime = 120 ;
  1580.  
  1581. self::$GlobalExecutionStartTime = microtime ( true ) ; // Set the start of the first execution
  1582.  
  1583. if ( self::$MaxGlobalExecutionTime > 0 )
  1584. self::$AllowedGlobalExecutionTime = self::$MaxGlobalExecutionTime ;
  1585. else
  1586. self::$AllowedGlobalExecutionTime = self::$PhpMaxExecutionTime + self::$MaxGlobalExecutionTime ;
  1587.  
  1588. // Adjust in case of inconsistent values
  1589. if ( self::$AllowedGlobalExecutionTime < 0 || self::$AllowedGlobalExecutionTime > self::$PhpMaxExecutionTime )
  1590. self::$AllowedGlobalExecutionTime = self::$PhpMaxExecutionTime - 1 ;
  1591.  
  1592. self::$StaticInitialized = true ;
  1593. }
  1594.  
  1595. parent::__construct ( ) ;
  1596.  
  1597. $this -> Options = $options ;
  1598.  
  1599. if ( $filename )
  1600. $this -> Load ( $filename, $user_password, $owner_password ) ;
  1601. }
  1602.  
  1603.  
  // String conversion : casting the object to a string yields the contents of the Text property
  // (the text produced by the most recent load operation, or whatever Text currently holds).
  public function __tostring ( )
  {
      return $this -> Text ;
  }
  1606.  
  1607.  
  1608. /**************************************************************************************************************
  1609. **************************************************************************************************************
  1610. **************************************************************************************************************
  1611. ****** ******
  1612. ****** ******
  1613. ****** PUBLIC METHODS ******
  1614. ****** ******
  1615. ****** ******
  1616. **************************************************************************************************************
  1617. **************************************************************************************************************
  1618. **************************************************************************************************************/
  1619.  
  1620. /*--------------------------------------------------------------------------------------------------------------
  1621.  
  1622. NAME
  1623. Load - Loads text contents from a PDF file.
  1624. LoadFromString - Loads PDF contents from a string.
  1625.  
  1626. PROTOTYPE
  1627. $text = $pdf -> Load ( $filename, $user_password = false, $owner_password = false ) ;
  1628. $text = $pdf -> LoadFromString ( $contents, $user_password = false, $owner_password = false ) ;
  1629.  
  1630. DESCRIPTION
  1631. The Load() method extracts text contents from the specified PDF file. Once processed, text contents will
  1632. be available through the "Text" property.
  1633. The LoadFromString() method performs the same operation on PDF contents already loaded into memory.
  1634.  
  1635. PARAMETERS
  1636. $filename (string) -
  1637. Name of the PDF file whose text contents are to be extracted.
  1638.  
  1639. $contents (string) -
  1640. String containing PDF contents.
  1641.  
  1642. $user_password (string) -
  1643. User password used for decrypting PDF contents.
  1644.  
  1645. $owner_password (string) -
  1646. Owner password.
  1647.  
  1648. *-------------------------------------------------------------------------------------------------------------*/
  // Memory snapshots taken when Load() or LoadFromString() is entered ; __load() subtracts them from
  // the values measured at the end of processing to fill the MemoryUsage and MemoryPeakUsage properties.
  private $__memory_usage_start ;
  private $__memory_peak_usage_start ;
  1651.  
  /*
   * Load - Reads a PDF file and extracts its text contents.
   *
   * $filename        Path of the PDF file. Names that start with a "scheme://" prefix are not checked
   *                  with file_exists() and are handed directly to file_get_contents().
   * $user_password   Passed through unchanged to __load().
   * $owner_password  Passed through unchanged to __load().
   *
   * Returns whatever __load() returns (the extracted text).
   *
   * Failures are reported by handing a PdfToTextDecodingException to error(), which is defined outside
   * this method (presumably it throws - confirm).
   */
  public function Load ( $filename, $user_password = false, $owner_password = false )
  {
      // Snapshot memory consumption so that __load() can compute how much this document used
      $this -> __memory_usage_start = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ;
      $this -> __memory_peak_usage_start = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ;

      // Check if the file exists, but only if the file is on a local filesystem
      $is_stream_url = preg_match ( '#^ [^:]+ ://#ix', $filename ) ;

      if ( ! $is_stream_url && ! file_exists ( $filename ) )
          error ( new PdfToTextDecodingException ( "File \"$filename\" does not exist." ) ) ;

      // Load its contents. The second parameter of file_get_contents() is the boolean $use_include_path ;
      // the FILE_BINARY constant (value 0) that used to be passed here never had any effect and is
      // deprecated as of PHP 8.1, so the default (false) is used instead.
      // Warnings are silenced on purpose : a failure is detected through the false return value below.
      $contents = @file_get_contents ( $filename ) ;

      if ( $contents === false )
          error ( new PdfToTextDecodingException ( "Unable to open \"$filename\"." ) ) ;

      return $this -> __load ( $filename, $contents, $user_password, $owner_password ) ;
  }
  1669.  
  1670.  
  /*
   * LoadFromString - Extracts text from PDF contents that are already held in memory.
   *
   * $contents        String containing the PDF data.
   * $user_password   Passed through unchanged to __load().
   * $owner_password  Passed through unchanged to __load().
   *
   * Returns whatever __load() returns. An empty filename is handed to __load() since there is no file.
   */
  public function LoadFromString ( $contents, $user_password = false, $owner_password = false )
  {
      // Record the starting memory figures (0 when the corresponding PHP function is unavailable)
      $usage_start = 0 ;
      $peak_usage_start = 0 ;

      if ( self::$HasMemoryGetUsage )
          $usage_start = memory_get_usage ( true ) ;

      if ( self::$HasMemoryGetPeakUsage )
          $peak_usage_start = memory_get_peak_usage ( true ) ;

      $this -> __memory_usage_start = $usage_start ;
      $this -> __memory_peak_usage_start = $peak_usage_start ;

      $extracted_text = $this -> __load ( '', $contents, $user_password, $owner_password ) ;

      return $extracted_text ;
  }
  1678.  
  1679.  
  /*
   * __load - Common implementation behind Load() and LoadFromString().
   *
   * Processing steps :
   *  1. Locate the "%PDF-x.y" signature and record the PDF version.
   *  2. Reset every per-document property and normalize the option flags.
   *  3. Collect all "n g obj ... endobj" objects (objects embedded in object streams are expanded).
   *  4. Classify each object (font, character map, font map, form data, image, text stream...).
   *  5. Build the page catalog and extract the text of every selected page, using either the raw
   *     or the basic layout renderer.
   *  6. Compute page locations, the final Text property, memory usage and optional statistics.
   *
   * PARAMETERS
   *  $filename        Name used in error messages and stored (through realpath()) in the Filename
   *                   property ; LoadFromString() passes an empty string.
   *  $contents        Raw PDF data.
   *  $user_password   Not referenced anywhere in this method body.
   *  $owner_password  Not referenced anywhere in this method body.
   *
   * RETURN VALUE
   *  The extracted text (same value as the Text property).
   *
   * NOTES
   *  Errors are reported by passing an exception object to error(), a function defined outside this
   *  method. The code below assumes that error() does not return (the "switch" on preg errors has no
   *  "break" after its error() calls) - TODO confirm.
   */
  private function __load ( $filename, $contents, $user_password = false, $owner_password = false )
  {
      // Search for the start of the document ("%PDF-x.y")
      $start_offset = strpos ( $contents, '%PDF' ) ;

      if ( $start_offset === false )        // Not a pdf document !
          error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ;
      else                                  // May be a PDF document
          $this -> DocumentStartOffset = $start_offset ;

      // Check that this is a PDF file with a valid version number
      if ( ! preg_match ( '/ %PDF- (?P<version> \d+ (\. \d+)*) /ix', $contents, $match, 0, $start_offset ) )
          error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ;

      $this -> PdfVersion = $match [ 'version' ] ;

      // Initializations
      $this -> Text = '' ;
      $this -> FontTable = new PdfTexterFontTable ( ) ;
      // NOTE(review): realpath('') is documented to resolve to the current directory, so Filename will
      // not be empty when called from LoadFromString() - confirm this is acceptable.
      $this -> Filename = realpath ( $filename ) ;
      $this -> Pages = array ( ) ;
      $this -> Images = array ( ) ;
      $this -> ImageData = array ( ) ;
      $this -> ImageCount = 0 ;
      $this -> AutoSavedImageFiles = array ( ) ;
      $this -> PageMap = new PdfTexterPageMap ( ) ;
      $this -> PageLocations = array ( ) ;
      $this -> Author = '' ;
      $this -> CreatorApplication = '' ;
      $this -> ProducerApplication = '' ;
      $this -> CreationDate = '' ;
      $this -> ModificationDate = '' ;
      $this -> Title = '' ;
      $this -> Subject = '' ;
      $this -> Keywords = '' ;
      $this -> GotAuthorInformation = false ;
      $this -> ID = '' ;
      $this -> ID2 = '' ;
      $this -> EncryptionData = false ;
      $this -> EnhancedStatistics = ( ( $this -> Options & self::PDFOPT_ENHANCED_STATISTICS ) != 0 ) ;

      // Also reset cached information that may come from previous runs
      $this -> MapIdBuffer = array ( ) ;
      $this -> RtlCharacterBuffer = array ( ) ;
      $this -> CharacterMapBuffer = array ( ) ;
      $this -> FontObjectsBuffer = array ( ) ;
      $this -> FormData = array ( ) ;
      $this -> FormDataObjectNumbers = false ;
      // NOTE(review): "FomDataDefinitions" looks like a misspelling of "FormDataDefinitions" ; the property
      // declaration is not visible from here, so the name is left untouched - verify against the class.
      $this -> FomDataDefinitions = array ( ) ;
      $this -> FormDataObjects = array ( ) ;
      $this -> CaptureDefinitions = false ;
      $this -> CaptureObject = false ;
      $this -> DocumentFragments = array ( ) ;

      // Enable the PDFOPT_BASIC_LAYOUT option if the PDFOPT_CAPTURE flag is specified
      if ( $this -> Options & self::PDFOPT_CAPTURE )
          $this -> Options |= self::PDFOPT_BASIC_LAYOUT ;

      // Enable the PDFOPT_BASIC_LAYOUT option if PDFOPT_DEBUG_SHOW_COORDINATES is specified
      if ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES )
          $this -> Options |= self::PDFOPT_BASIC_LAYOUT ;

      // Page layout options need more instructions to be retained - select the appropriate list of useless instructions
      if ( $this -> Options & self::PDFOPT_BASIC_LAYOUT )
          $this -> IgnoredInstructions = self::$IgnoredInstructionsLayout ;
      else
          $this -> IgnoredInstructions = self::$IgnoredInstructionsNoLayout ;

      // Debug statistics
      $this -> Statistics = array
         (
          'TextSize'          => 0,         // Total size of drawing instructions ("text" objects)
          'OptimizedTextSize' => 0,         // Optimized text size, with useless instructions removed
          'Distributions'     => array      // Statistics about handled instructions distribution - Works only with the page layout option in debug mode
             (
              'operand'  => 0,
              'Tm'       => 0,
              'Td'       => 0,
              'TD'       => 0,
              "'"        => 0,
              'TJ'       => 0,
              'Tj'       => 0,
              'Tf'       => 0,
              'TL'       => 0,
              'T*'       => 0,
              '('        => 0,
              '<'        => 0,
              '['        => 0,
              'cm'       => 0,
              'BT'       => 0,
              'template' => 0,
              'ignored'  => 0,
              'space'    => 0
             )
         ) ;

      // Per-instance execution time limit :
      // a positive MaxExecutionTime is an absolute limit ; a zero or negative one is relative to PHP's own limit
      $this -> ExecutionStartTime = microtime ( true ) ;

      if ( $this -> MaxExecutionTime > 0 )
          $this -> AllowedExecutionTime = $this -> MaxExecutionTime ;
      else
          $this -> AllowedExecutionTime = self::$PhpMaxExecutionTime + $this -> MaxExecutionTime ;

      // Adjust in case of inconsistent values
      if ( $this -> AllowedExecutionTime < 0 || $this -> AllowedExecutionTime > self::$PhpMaxExecutionTime )
          $this -> AllowedExecutionTime = self::$PhpMaxExecutionTime - 1 ;

      // Systematically set the DECODE_IMAGE_DATA flag if the AUTOSAVE_IMAGES flag has been specified
      if ( $this -> Options & self::PDFOPT_AUTOSAVE_IMAGES )
          $this -> Options |= self::PDFOPT_DECODE_IMAGE_DATA ;

      // Systematically set the GET_IMAGE_DATA flag if DECODE_IMAGE_DATA is specified (debug mode only)
      if ( self::$DEBUG && ( $this -> Options & self::PDFOPT_DECODE_IMAGE_DATA ) )
          $this -> Options |= self::PDFOPT_GET_IMAGE_DATA ;

      // Since page layout options take 2 bits, but not all of the 4 possible values are allowed, make sure that an invalid
      // value will default to PDFOPT_RAW_LAYOUT value.
      // The original test was written "! $layout_option === ...", which PHP parses as "( ! $layout_option ) === ..." :
      // a boolean compared to an integer, hence always false. The fallback never ran and an invalid layout value
      // matched neither of the rendering branches in the page loop below.
      $layout_option = $this -> Options & self::PDFOPT_LAYOUT_MASK ;

      if ( $layout_option !== self::PDFOPT_RAW_LAYOUT && $layout_option !== self::PDFOPT_BASIC_LAYOUT )
         {
          $layout_option = self::PDFOPT_RAW_LAYOUT ;
          $this -> Options = ( $this -> Options & ~self::PDFOPT_LAYOUT_MASK ) | self::PDFOPT_RAW_LAYOUT ;
         }

      // Author information needs to be processed after, because it may reference objects that occur later in the PDF stream
      $author_information_object_id = false ;

      // Extract pdf objects that are enclosed by the "obj" and "endobj" keywords
      $pdf_objects = array ( ) ;
      $contents_offset = $this -> DocumentStartOffset ;
      $contents_length = strlen ( $contents ) ;

      while ( $contents_offset < $contents_length &&
              preg_match ( '/(?P<re> (?P<object_id> \d+) \s+ \d+ \s+ obj (?P<object> .*?) endobj )/imsx', $contents, $match, PREG_OFFSET_CAPTURE, $contents_offset ) )
         {
          $object_number = $match [ 'object_id' ] [0] ;
          $object_data = $match [ 'object' ] [0] ;

          // Handle the special case of object streams (compound objects)
          // They are not added in the $pdf_objects array, because they could be mistakenly processed as relevant information,
          // such as font definitions, etc.
          // Instead, only the objects they are embedding are stored in this array.
          if ( $this -> IsObjectStream ( $object_data ) )
             {
              // Ignore ill-formed object streams
              if ( ( $object_stream_matches = $this -> DecodeObjectStream ( $object_number, $object_data ) ) !== false )
                 {
                  // Add this list of objects to the list of known objects
                  for ( $j = 0, $object_stream_count = count ( $object_stream_matches [ 'object_id' ] ) ; $j < $object_stream_count ; $j ++ )
                      $pdf_objects [ $object_stream_matches [ 'object_id' ] [$j] ] = $object_stream_matches [ 'object' ] [$j] ;
                 }
             }
          // Normal (non-compound) object
          else
              $pdf_objects [ $object_number ] = $object_data ;

          // Update current offset through PDF contents : resume right after the object that has just been matched
          $contents_offset = $match [ 're' ] [1] + strlen ( $match [ 're' ] [0] ) ;
         }

      // We pay particular attention to errors reported by preg_match() here, since we need to be really sure why we stopped
      // finding further PDF objects in the supplied contents (end of data, or a PCRE failure)
      $preg_error = preg_last_error ( ) ;

      switch ( $preg_error )
         {
          case PREG_NO_ERROR :
              break ;

          // None of the cases below has a "break" : they rely on error() not returning
          case PREG_INTERNAL_ERROR :
              error ( new PdfToTextDecodingException ( "PDF object extraction : the preg_match_all() function encountered an internal error." ) ) ;

          case PREG_BACKTRACK_LIMIT_ERROR :
              error ( new PdfToTextDecodingException ( "PDF object extraction : backtrack limit reached (you may have to modify the pcre.backtrack_limit " .
                      "setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.backtrack_limit' ) . ")." ) ) ;

          case PREG_JIT_STACKLIMIT_ERROR :
              error ( new PdfToTextDecodingException ( "PDF object extraction : JIT stack limit reached (you may disable this feature by setting the pcre.jit " .
                      "setting of your PHP.ini file to 0)." ) ) ;

          case PREG_RECURSION_LIMIT_ERROR :
              error ( new PdfToTextDecodingException ( "PDF object extraction : recursion limit reached (you may have to modify the pcre.recursion_limit " .
                      "setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.recursion_limit' ) . ")." ) ) ;

          case PREG_BAD_UTF8_ERROR :
              error ( new PdfToTextDecodingException ( "PDF object extraction : bad UTF8 character encountered." ) ) ;

          case PREG_BAD_UTF8_OFFSET_ERROR :
              error ( new PdfToTextDecodingException ( "PDF object extraction : the specified offset does not start at the beginning of a valid UTF8 codepoint." ) ) ;

          default :
              error ( new PdfToTextDecodingException ( "PDF object extraction : unkown PREG error #$preg_error" ) ) ;
         }

      // Extract trailer information, which may contain the ID of an object specifying encryption flags
      $this -> GetTrailerInformation ( $contents, $pdf_objects ) ;
      unset ( $contents ) ;                 // The raw contents are no longer needed ; release them

      // Character maps encountered so far
      $cmaps = array ( ) ;

      // An array that will store object ids as keys and text contents as values
      $text = array ( ) ;

      // Loop through the objects
      foreach ( $pdf_objects as $object_number => $object_data )
         {
          // Some additional objects may be uncovered after processing (in an object containing compacted objects for example)
          // so add them to the list if necessary
          if ( ! isset ( $pdf_objects [ $object_number ] ) )
              $pdf_objects [ $object_number ] = $object_data ;

          // Try to catch information related to page mapping - but don't discard the object since it can contain additional information
          $this -> PageMap -> Peek ( $object_number, $object_data, $pdf_objects ) ;

          // Check if the object contains authoring information - it can appear encoded or unencoded
          if ( ! $this -> GotAuthorInformation )
              $author_information_object_id = $this -> PeekAuthorInformation ( $object_number, $object_data ) ;

          // Also catch the object encoding type
          $type = $this -> GetEncodingType ( $object_number, $object_data ) ;
          $stream_match = null ;

          // First case : the object has no "stream ... endstream" section
          if ( strpos ( $object_data, 'stream' ) === false ||
               ! preg_match ( '#[^/] stream \s+ (?P<stream> .*?) endstream#imsx', $object_data, $stream_match ) )
             {
              // Some font definitions are in clear text in an object, some are encoded in a stream within the object
              // We process here the unencoded ones
              if ( $this -> IsFont ( $object_data ) )
                 {
                  $this -> FontTable -> Add ( $object_number, $object_data, $pdf_objects, $this -> AdobeExtraMappings ) ;
                  continue ;
                 }
              // Some character maps may also be in clear text
              else if ( $this -> IsCharacterMap ( $object_data ) )
                 {
                  $cmap = PdfTexterCharacterMap::CreateInstance ( $object_number, $object_data, $this -> AdobeExtraMappings ) ;

                  if ( $cmap )
                      $cmaps [] = $cmap ;

                  continue ;
                 }
              // Check if there is an association between font number and object number
              // (no "continue" here : the object is discarded further down because it has no stream data)
              else if ( $this -> IsFontMap ( $object_data ) )
                 {
                  $this -> FontTable -> AddFontMap ( $object_number, $object_data ) ;
                 }
              // Retrieve form data if present
              else if ( $this -> IsFormData ( $object_data ) )
                 {
                  $this -> RetrieveFormData ( $object_number, $object_data, $pdf_objects ) ;
                 }
              // Ignore other objects that do not contain an encoded stream
              else
                 {
                  if ( self::$DEBUG > 1 )
                      echo "\n----------------------------------- UNSTREAMED #$object_number\n$object_data" ;

                  continue ;
                 }
             }
          // Extract image data, if any
          else if ( $this -> IsImage ( $object_data ) )
             {
              $this -> AddImage ( $object_number, $stream_match [ 'stream' ], $type, $object_data ) ;
              continue ;
             }
          // Check if there is an association between font number and object number
          else if ( $this -> IsFontMap ( $object_data ) )
             {
              $this -> FontTable -> AddFontMap ( $object_number, $object_data ) ;

              if ( ! $stream_match )
                  continue ;
             }

          // Check if the stream contains data (yes, I have found a sample that had streams of length 0...)
          // In other words : ignore empty streams
          if ( stripos ( $object_data, '/Length 0' ) !== false )
              continue ;

          // Isolate stream data and try to find its encoding type
          if ( isset ( $stream_match [ 'stream' ] ) )
              $stream_data = ltrim ( $stream_match [ 'stream' ], "\r\n" ) ;
          else
              continue ;

          // Ignore this stream if the object does not contain an encoding type (/FLATEDECODE, /ASCIIHEX or /ASCII85)
          if ( $type == self::PDF_UNKNOWN_ENCODING )
             {
              if ( self::$DEBUG > 1 )
                  echo "\n----------------------------------- UNENCODED #$object_number :\n$object_data" ;

              continue ;
             }

          // Decode the encoded stream
          $decoded_stream_data = $this -> DecodeData ( $object_number, $stream_data, $type, $object_data ) ;

          // Second chance to peek author information, this time on a decoded stream data
          if ( ! $this -> GotAuthorInformation )
              $author_information_object_id = $this -> PeekAuthorInformation ( $object_number, $decoded_stream_data ) ;

          // Check for character maps
          if ( $this -> IsCharacterMap ( $decoded_stream_data ) )
             {
              $cmap = PdfTexterCharacterMap::CreateInstance ( $object_number, $decoded_stream_data, $this -> AdobeExtraMappings ) ;

              if ( $cmap )
                  $cmaps [] = $cmap ;
             }
          // Font definitions
          else if ( $this -> IsFont ( $decoded_stream_data ) )
             {
              $this -> FontTable -> Add ( $object_number, $decoded_stream_data, $pdf_objects, $this -> AdobeExtraMappings ) ;
             }
          // Retrieve form data if present
          // NOTE(review): the test is made on the raw object data while the decoded stream is what gets processed - confirm intent
          else if ( $this -> IsFormData ( $object_data ) )
             {
              $this -> RetrieveFormData ( $object_number, $decoded_stream_data, $pdf_objects ) ;
             }
          // Plain text (well, in fact PDF drawing instructions)
          else if ( $this -> IsText ( $object_data, $decoded_stream_data ) )
             {
              $text_data = false ;

              // Check if we need to ignore page headers and footers
              if ( $this -> Options & self::PDFOPT_IGNORE_HEADERS_AND_FOOTERS )
                 {
                  if ( ! $this -> IsPageHeaderOrFooter ( $decoded_stream_data ) )
                     {
                      $text [ $object_number ] =
                      $text_data = $decoded_stream_data ;
                     }
                  // However, they may be mixed with actual text contents so we need to separate them...
                  else
                     {
                      $this -> ExtractTextData ( $object_number, $decoded_stream_data, $remainder, $header, $footer ) ;

                      // We still need to check again that the extracted text portion contains something useful
                      if ( $this -> IsText ( $object_data, $remainder ) )
                         {
                          $text [ $object_number ] =
                          $text_data = $remainder ;
                         }
                     }
                 }
              else
                 {
                  $text [ $object_number ] =
                  $text_data = $decoded_stream_data ;
                 }

              // The current object may be a text object that have been defined as an XObject in some other object
              // In this case, we have to keep it since it may be referenced by a /TPLx construct from within
              // another text object
              if ( $text_data )
                  $this -> PageMap -> AddTemplateObject ( $object_number, $text_data ) ;
             }
          // This may be here the opportunity to look into the $FormData property and replace object ids with their corresponding data
          else
             {
              $found = false ;

              foreach ( $this -> FormData as &$form_entry )
                 {
                  if ( is_integer ( $form_entry [ 'values' ] ) && $object_number == $form_entry [ 'values' ] )
                     {
                      $form_entry [ 'values' ] = $decoded_stream_data ;
                      $found = true ;
                     }
                  else if ( is_integer ( $form_entry [ 'form' ] ) && $object_number == $form_entry [ 'form' ] )
                     {
                      $form_entry [ 'form' ] = $decoded_stream_data ;
                      $found = true ;
                     }
                 }

              // Break the reference so that later code cannot modify the last FormData entry by accident
              unset ( $form_entry ) ;

              if ( ! $found && self::$DEBUG > 1 )
                  echo "\n----------------------------------- UNRECOGNIZED #$object_number :\n$decoded_stream_data\n" ;
             }
         }

      // Form data object numbers
      $this -> FormDataObjectNumbers = array_keys ( $this -> FormData ) ;

      // Associate character maps with declared fonts
      foreach ( $cmaps as $cmap )
          $this -> FontTable -> AddCharacterMap ( $cmap ) ;

      // Current font defaults to -1, which means : take the first available font as the current one.
      // Sometimes it may happen that text drawing instructions do not set a font at all (PdfPro for example)
      $current_font = -1 ;

      // Build the page catalog
      $this -> Pages = array ( ) ;
      $this -> PageMap -> MapObjects ( $text ) ;

      // Add font mappings local to each page
      $mapped_fonts = $this -> PageMap -> GetMappedFonts ( ) ;
      $this -> FontTable -> AddPageFontMap ( $mapped_fonts ) ;

      // Extract text from the collected text elements
      foreach ( $this -> PageMap -> Pages as $page_number => $page_objects )
         {
          // Checks if this page is selected
          if ( ! $this -> IsPageSelected ( $page_number ) )
              continue ;

          $this -> Pages [ $page_number ] = '' ;

          // Raw layout : the text of each object is simply appended in object order
          if ( $layout_option === self::PDFOPT_RAW_LAYOUT )
             {
              foreach ( $page_objects as $page_object )
                 {
                  if ( isset ( $text [ $page_object ] ) )
                     {
                      $new_text = $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ;
                      $object_text = $this -> ExtractText ( $page_number, $page_object, $new_text, $current_font ) ;
                      $this -> Pages [ $page_number ] .= $object_text ;
                     }
                  else if ( self::$DEBUG > 1 )
                      echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ;
                 }
             }
          // New style (basic) layout rendering
          else if ( $layout_option === self::PDFOPT_BASIC_LAYOUT )
             {
              $page_fragments = array ( ) ;

              foreach ( $page_objects as $page_object )
                 {
                  if ( isset ( $text [ $page_object ] ) )
                     {
                      $new_text = $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ;
                      $this -> ExtractTextWithLayout ( $page_fragments, $page_number, $page_object, $new_text, $current_font ) ;
                     }
                  else if ( self::$DEBUG > 1 )
                      echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ;
                 }

              // $page_width and $page_height are not set before this call ; presumably __assemble_text_fragments()
              // fills them by reference - TODO confirm
              $this -> Pages [ $page_number ] = $this -> __assemble_text_fragments ( $page_number, $page_fragments, $page_width, $page_height ) ;

              // NOTE(review): the two keys use different separators ('page-width' vs 'page_height') ; they are kept
              // as is because code reading DocumentFragments may depend on them
              $this -> DocumentFragments [ $page_number ] = array
                 (
                  'fragments'   => $page_fragments,
                  'page-width'  => $page_width,
                  'page_height' => $page_height
                 ) ;
             }
         }

      // Retrieve author information
      if ( $this -> GotAuthorInformation )
          $this -> RetrieveAuthorInformation ( $author_information_object_id, $pdf_objects ) ;

      // Build the page locations (ie, starting and ending offsets)
      $offset = 0 ;
      $page_separator = utf8_encode ( $this -> PageSeparator ) ;
      $page_separator_length = strlen ( $page_separator ) ;

      foreach ( $this -> Pages as $page_number => &$page )
         {
          // If hyphenated words are unwanted, then remove them
          if ( $this -> Options & self::PDFOPT_NO_HYPHENATED_WORDS )
              $page = preg_replace ( self::$RemoveHyphensRegex, '$4$2', $page ) ;

          // Offsets are byte offsets into the final Text property, page separators included
          $length = strlen ( $page ) ;
          $this -> PageLocations [ $page_number ] = array ( 'start' => $offset, 'end' => $offset + $length - 1 ) ;
          $offset += $length + $page_separator_length ;
         }

      // Break the reference so that $page no longer aliases the last element of the Pages array
      unset ( $page ) ;

      // And finally, the Text property
      $this -> Text = implode ( $page_separator, $this -> Pages ) ;

      // Free memory
      $this -> MapIdBuffer = array ( ) ;
      $this -> RtlCharacterBuffer = array ( ) ;
      $this -> CharacterMapBuffer = array ( ) ;

      // Compute memory occupied for this file
      $memory_usage_end = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ;
      $memory_peak_usage_end = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ;

      $this -> MemoryUsage = $memory_usage_end - $this -> __memory_usage_start ;
      $this -> MemoryPeakUsage = $memory_peak_usage_end - $this -> __memory_peak_usage_start ;

      // Adjust the "Distributions" statistics
      if ( $this -> Options & self::PDFOPT_ENHANCED_STATISTICS )
         {
          $instruction_count = 0 ;
          $statistics = array ( ) ;

          // Count the total number of instructions
          foreach ( $this -> Statistics [ 'Distributions' ] as $count )
              $instruction_count += $count ;

          // Now transform the Distributions entries into an associative array containing the instruction counts
          // ('count') and their relative percentage
          foreach ( $this -> Statistics [ 'Distributions' ] as $name => $count )
             {
              if ( $instruction_count )
                  $percent = round ( ( 100.0 / $instruction_count ) * $count, 2 ) ;
              else
                  $percent = 0 ;

              $statistics [ $name ] = array
                 (
                  'instruction' => $name,
                  'count'       => $count,
                  'percent'     => $percent
                 ) ;
             }

          // Set the new 'Distributions' array and sort it by instruction count in reverse order
          $this -> Statistics [ 'Distributions' ] = $statistics ;
          uksort ( $this -> Statistics [ 'Distributions' ], array ( $this, '__sort_distributions' ) ) ;
         }

      // All done, return
      return ( $this -> Text ) ;
  }
  2209.  
  2210.  
  // Comparison callback handed to uksort() by __load() : $a and $b are keys of the 'Distributions'
  // statistics array. The result is positive when entry $b has the larger 'count', which orders the
  // entries by decreasing instruction count.
  public function __sort_distributions ( $a, $b )
  {
      $distributions = $this -> Statistics [ 'Distributions' ] ;
      $count_of_b = $distributions [$b] [ 'count' ] ;
      $count_of_a = $distributions [$a] [ 'count' ] ;

      return $count_of_b - $count_of_a ;
  }
  2213.  
  2214.  
  2215.  
  2216. /*--------------------------------------------------------------------------------------------------------------
  2217.  
  2218. NAME
  2219. AddAdobeExtraMappings - Adds extra mappings for standard Adobe fonts.
  2220.  
  2221. PROTOTYPE
  2222. $pdf -> AddAdobeExtraMappings ( $mappings ) ;
  2223.  
  2224. DESCRIPTION
  2225. Adobe supports 4 predefined fonts (standard, Mac, WinAnsi and PDF). All the characters in these fonts
  2226. are identified by a character name, a little bit like HTML entities ; for example, 'one' will be the
  2227. character '1', 'acircumflex' will be 'â', etc.
  2228. There are thousands of character names defined by Adobe (see https://mupdf.com/docs/browse/source/pdf/pdf-glyphlist.h.html).
  2229. Some of them are not in this list ; this is the case for example of the 'ax' character names, where 'x'
  2230. is a decimal number. When such a character is specified in a /Differences array, then there is somewhere
  2231. a CharProc[] array giving an object id for each of those characters.
  2232. The referenced object(s) in turn contain drawing instructions to draw the glyph. At no point you could
  2233. guess what is the corresponding Unicode character for this glyph, since the information is not contained
  2234. in the PDF file.
  2235. The AddAdobeExtraMappings() method allows you to specify such correspondences. Specify an array as the
  2236. $mappings parameter, whose keys are the Adobe character name (for example, "a127") and values the
  2237. corresponding Unicode values (see the description of the $mappings parameter for more information).
  2238.  
  2239. PARAMETERS
  2240. $mappings (associative array) -
  2241. Associative array whose keys are Adobe character names. The array values can take several forms :
  2242. - A character
  2243. - An integer value
  2244. - An array of up to four character or integer values.
  2245. Internally, every specified value is converted to an array of four integer values, one for
  2246. each of the standard Adobe character sets (Standard, Mac, WinAnsi and PDF). The following
  2247. rules apply :
  2248. - If the input value is a single character, the output array corresponding to the Adobe character
  2249. name will be a set of 4 elements corresponding to the ordinal value of the supplied
  2250. character.
  2251. - If the input value is an integer, the output array will be a set of 4 identical values
  2252. - If the input value is an array :
  2253. . Arrays with fewer than 4 elements will be padded, using the last array item for padding
  2254. . Arrays with more than 4 elements will be silently truncated
  2255. . Each array value can either be a character or a numeric value.
  2256.  
  2257. NOTES
  2258. In this current implementation, the method applies the mappings to ALL Adobe default fonts. That is,
  2259. you cannot have one mapping for one Adobe font referenced in the PDF file, then a second mapping for
  2260. a second Adobe font, etc.
  2261.  
  2262. *-------------------------------------------------------------------------------------------------------------*/
  /*
   * AddAdobeExtraMappings - Registers extra character-name-to-code mappings.
   *
   * $mappings is an associative array whose keys are Adobe character names. Each value is normalized to
   * an array of exactly 4 integers before being stored in the AdobeExtraMappings property under its key :
   *  - a string is replaced by the ord() value of its first byte, repeated 4 times ;
   *  - any other numeric value is cast to integer and repeated 4 times ;
   *  - an array contributes its first 4 values (strings through ord(), anything else through an integer
   *    cast) ; shorter arrays are padded with their last value.
   *
   * The first 4 array values are taken in iteration order, whatever their keys are : arrays with string
   * or non-sequential keys used to trigger undefined-offset notices and end up with zero codes.
   *
   * Empty arrays and unsupported value types are reported by passing a PdfToTextException to error(),
   * which is defined outside this method (presumably it throws - confirm).
   */
  public function AddAdobeExtraMappings ( $mappings )
  {
      // Loop through each mapping
      foreach ( $mappings as $key => $value )
         {
          // Character value : we retain its ordinal value as the 4 values of the output array
          // (the string test comes first, so numeric strings such as "65" are treated as characters)
          if ( is_string ( $value ) )
             {
              $ord = ord ( $value ) ;
              $items = array ( $ord, $ord, $ord, $ord ) ;
             }
          // Numeric value : the output array will contain 4 times the supplied value
          else if ( is_numeric ( $value ) )
             {
              $value = ( int ) $value ;
              $items = array ( $value, $value, $value, $value ) ;
             }
          // Array value : make sure we will have an output array of 4 values
          else if ( is_array ( $value ) )
             {
              $items = array ( ) ;

              // Collect at most 4 of the supplied values, converting characters to their ordinal values if necessary.
              // array_slice() walks the array in order, so the keys do not need to be 0, 1, 2, 3.
              foreach ( array_slice ( $value, 0, 4 ) as $code )
                 {
                  if ( is_string ( $code ) )
                      $items [] = ord ( $code ) ;
                  else
                      $items [] = ( int ) $code ;
                 }

              // Ensure that we have 4 values ; fill the missing ones with the last seen value if necessary
              $count = count ( $items ) ;

              if ( ! $count )
                  error ( new PdfToTextException ( "Adobe extra mapping \"$key\" has no values." ) ) ;

              $last_value = $items [ $count - 1 ] ;

              for ( $i = $count ; $i < 4 ; $i ++ )
                  $items [] = $last_value ;
             }
          else
              error ( new PdfToTextException ( "Invalid value \"$value\" for Adobe extra mapping \"$key\"." ) ) ;

          // Add this current mapping to the Adobe extra mappings array
          $this -> AdobeExtraMappings [ $key ] = $items ;
         }
  }
  2314.  
  2315.  
  2316. /*--------------------------------------------------------------------------------------------------------------
  2317.  
  2318. NAME
  2319. GetPageFromOffset - Returns a page number from a text offset.
  2320.  
  2321. PROTOTYPE
  2322. $page = $pdf -> GetPageFromOffset ( $offset ) ;
  2323.  
  2324. DESCRIPTION
  2325. Given a byte offset in the Text property, returns its page number in the pdf document.
  2326.  
  2327. PARAMETERS
  2328. $offset (integer) -
  2329. Offset, in the Text property, whose page number is to be retrieved.
  2330.  
  2331. RETURN VALUE
  2332. Returns a page number in the pdf document, or false if the specified offset does not exist.
  2333.  
  2334. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Maps an offset in the Text property to a page number.
   *
   * @param int|false $offset  Offset to look up ; false is passed through unchanged.
   *
   * @return int|string|false  Key of the first $this->PageLocations entry whose
   *                           'start'..'end' range (both inclusive) contains $offset,
   *                           or false when no entry matches.
   */
  public function GetPageFromOffset($offset)
  {
      $found = false;

      if ($offset !== false) {
          foreach ($this->PageLocations as $page => $bounds) {
              if ($offset >= $bounds['start'] && $offset <= $bounds['end']) {
                  $found = $page;
                  break;
              }
          }
      }

      return $found;
  }
  2348.  
  2349.  
  2350. /*--------------------------------------------------------------------------------------------------------------
  2351.  
  2352. NAME
  2353. text_strpos, text_stripos - Search for an occurrence of a string.
  2354.  
  2355. PROTOTYPE
  2356. $result = $pdf -> text_strpos ( $search, $start = 0 ) ;
  2357. $result = $pdf -> text_stripos ( $search, $start = 0 ) ;
  2358.  
  2359. DESCRIPTION
  2360. These methods behave as the strpos/stripos PHP functions, except that :
  2361. - They operate on the text contents of the pdf file (Text property)
  2362. - They return an array containing the page number and text offset. $result [0] will be set to the page
  2363. number of the searched text, and $result [1] to its offset in the Text property
  2364.  
  2365. PARAMETERS
  2366. $search (string) -
  2367. String to be searched.
  2368.  
  2369. $start (integer) -
  2370. Start offset in the pdf text contents.
  2371.  
  2372. RETURN VALUE
  2373. Returns an array of two values containing the page number and text offset if the searched string has
  2374. been found, or false otherwise.
  2375.  
  2376. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Case-sensitive search for the first occurrence of $search in the Text property.
   *
   * @param string $search  String to look for.
   * @param int    $start   Offset (as understood by mb_strpos with UTF-8) where the search starts.
   *
   * @return array|false  array ( page, offset ) when found, false otherwise. The page is
   *                      whatever GetPageFromOffset() returns for the found offset.
   */
  public function text_strpos($search, $start = 0)
  {
      $position = mb_strpos($this->Text, $search, $start, 'UTF-8');

      return ($position === false)
          ? false
          : array($this->GetPageFromOffset($position), $position);
  }
  2386.  
  2387.  
  /**
   * Case-insensitive search for the first occurrence of $search in the Text property.
   *
   * @param string $search  String to look for.
   * @param int    $start   Offset (as understood by mb_stripos with UTF-8) where the search starts.
   *
   * @return array|false  array ( page, offset ) when found, false otherwise. The page is
   *                      whatever GetPageFromOffset() returns for the found offset.
   */
  public function text_stripos($search, $start = 0)
  {
      $position = mb_stripos($this->Text, $search, $start, 'UTF-8');

      return ($position === false)
          ? false
          : array($this->GetPageFromOffset($position), $position);
  }
  2397.  
  2398.  
  2399.  
  2400.  
  2401. /*--------------------------------------------------------------------------------------------------------------
  2402.  
  2403. NAME
  2404. document_strpos, document_stripos - Search for all occurrences of a string.
  2405.  
  2406. PROTOTYPE
  2407. $result = $pdf -> document_strpos ( $search, $group_by_page = false ) ;
  2408. $result = $pdf -> document_stripos ( $search, $group_by_page = false ) ;
  2409.  
  2410. DESCRIPTION
  2411. Searches for ALL occurrences of a given string in the pdf document. The value of the $group_by_page
  2412. parameter determines how the results are returned :
  2413. - When true, the returned value will be an associative array whose keys will be page numbers and values
  2414. arrays of offset of the found string within the page
  2415. - When false, the returned value will be an array of arrays containing two entries : the page number
  2416. and the text offset.
  2417.  
  2418. For example, if a pdf document contains the string "here" at character offset 100 and 200 in page 1, and
  2419. position 157 in page 3, the returned value will be :
  2420. - When $group_by_page is false :
  2421. [ [ 1, 100 ], [ 1, 200 ], [ 3, 157 ] ]
  2422. - When $group_by_page is true :
  2423. [ 1 => [ 100, 200 ], 3 => [ 157 ] ]
  2424.  
  2425. PARAMETERS
  2426. $search (string) -
  2427. String to be searched.
  2428.  
  2429. $group_by_page (boolean) -
  2430. Indicates whether the found offsets should be grouped by page number or not.
  2431.  
  2432. RETURN VALUE
  2433. Returns an array of page numbers/character offsets (see Description above). The array is empty if the
  2434. specified string does not appear in the document ; false is returned only for an empty search string.
  2435.  
  2436. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Finds every case-sensitive occurrence of $text in the Text property.
   *
   * @param string $text           String to look for.
   * @param bool   $group_by_page  When true, results are grouped as page => array of offsets ;
   *                               when false, each result is array ( page, offset ).
   *
   * @return array|false  false when $text is empty ; otherwise the (possibly empty) list of
   *                      occurrences. Offsets are the character offsets returned by mb_strpos().
   */
  public function document_strpos($text, $group_by_page = false)
  {
      // mb_strpos() works with character offsets, so the step used to move past a match
      // must be the needle length in characters, not in bytes : stepping by strlen() skips
      // matches for multibyte needles and can push the offset past the end of the text.
      $needle_length = mb_strlen($text, 'UTF-8');

      if (!$needle_length) {
          return (false);
      }

      $result = array();
      $index = 0;

      while (($index = mb_strpos($this->Text, $text, $index, 'UTF-8')) !== false) {
          $page = $this->GetPageFromOffset($index);

          if ($group_by_page) {
              $result[$page][] = $index;
          } else {
              $result[] = array($page, $index);
          }

          // Resume right after the current match (occurrences do not overlap)
          $index += $needle_length;
      }

      // An empty array (not false) is returned when nothing was found
      return ($result);
  }
  2461.  
  2462.  
  /**
   * Finds every case-insensitive occurrence of $text in the Text property.
   *
   * @param string $text           String to look for.
   * @param bool   $group_by_page  When true, results are grouped as page => array of offsets ;
   *                               when false, each result is array ( page, offset ).
   *
   * @return array|false  false when $text is empty ; otherwise the (possibly empty) list of
   *                      occurrences. Offsets are the character offsets returned by mb_stripos().
   */
  public function document_stripos($text, $group_by_page = false)
  {
      // mb_stripos() works with character offsets, so the step used to move past a match
      // must be the needle length in characters, not in bytes : stepping by strlen() skips
      // matches for multibyte needles and can push the offset past the end of the text.
      $needle_length = mb_strlen($text, 'UTF-8');

      if (!$needle_length) {
          return (false);
      }

      $result = array();
      $index = 0;

      while (($index = mb_stripos($this->Text, $text, $index, 'UTF-8')) !== false) {
          $page = $this->GetPageFromOffset($index);

          if ($group_by_page) {
              $result[$page][] = $index;
          } else {
              $result[] = array($page, $index);
          }

          // Resume right after the current match (occurrences do not overlap)
          $index += $needle_length;
      }

      // An empty array (not false) is returned when nothing was found
      return ($result);
  }
  2487.  
  2488.  
  2489. /*--------------------------------------------------------------------------------------------------------------
  2490.  
  2491. NAME
  2492. text_match, document_match - Search string using regular expressions.
  2493.  
  2494. PROTOTYPE
  2495. $status = $pdf -> text_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ;
  2496. $status = $pdf -> document_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ;
  2497.  
  2498. DESCRIPTION
  2499. text_match() calls the preg_match() PHP function on the pdf text contents, to locate the first occurrence
  2500. of text that matches the specified regular expression.
  2501. document_match() calls the preg_match_all() function to locate all occurrences that match the specified
  2502. regular expression.
  2503. Note that both methods add the PREG_OFFSET_CAPTURE flag when calling preg_match/preg_match_all so you
  2504. should be aware that all captured results are an array containing the following entries :
  2505. - Item [0] is the captured string
  2506. - Item [1] is its text offset
  2507. - The text_match() and document_match() methods add an extra array item (index 2), which contains the
  2508. page number where the matched text resides
  2509.  
  2510. PARAMETERS
  2511. $pattern (string) -
  2512. Regular expression to be searched.
  2513.  
  2514. $match (any) -
  2515. Output captures. See preg_match/preg_match_all.
  2516.  
  2517. $flags (integer) -
  2518. PCRE flags. See preg_match/preg_match_all.
  2519.  
  2520. $offset (integer) -
  2521. Start offset. See preg_match/preg_match_all.
  2522.  
  2523. RETURN VALUE
  2524. Returns the number of matched occurrences, or false if the specified regular expression is invalid.
  2525.  
  2526. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Runs preg_match() on the Text property, always adding PREG_OFFSET_CAPTURE to $flags.
   *
   * On a successful match, every capture receives a third item (index 2) holding the result
   * of GetPageFromOffset() for the capture offset, and the captures are copied to $match.
   * $match is left untouched when nothing matched.
   *
   * @param string $pattern  Regular expression.
   * @param mixed  $match    Receives the captures.
   * @param int    $flags    Extra preg_match() flags.
   * @param int    $offset   Start offset handed to preg_match().
   *
   * @return int|false  The preg_match() return value.
   */
  public function text_match($pattern, &$match = null, $flags = 0, $offset = 0)
  {
      $captures = null;
      $status = preg_match($pattern, $this->Text, $captures, $flags | PREG_OFFSET_CAPTURE, $offset);

      if (!$status) {
          return ($status);
      }

      // Tag each capture with its page
      foreach (array_keys($captures) as $key) {
          $captures[$key][2] = $this->GetPageFromOffset($captures[$key][1]);
      }

      $match = $captures;

      return ($status);
  }
  2542.  
  2543.  
  /**
   * Runs preg_match_all() on the Text property, always adding PREG_OFFSET_CAPTURE to $flags.
   *
   * When at least one match is found, every capture of the two-level result array receives
   * a third item (index 2) holding the result of GetPageFromOffset() for the capture offset,
   * and the result is copied to $matches. $matches is left untouched otherwise.
   *
   * @param string $pattern  Regular expression.
   * @param mixed  $matches  Receives the captures.
   * @param int    $flags    Extra preg_match_all() flags.
   * @param int    $offset   Start offset handed to preg_match_all().
   *
   * @return int|false  The preg_match_all() return value.
   */
  public function document_match($pattern, &$matches = null, $flags = 0, $offset = 0)
  {
      $captures = null;
      $status = preg_match_all($pattern, $this->Text, $captures, $flags | PREG_OFFSET_CAPTURE, $offset);

      if (!$status) {
          return ($status);
      }

      // Tag each capture with its page
      foreach (array_keys($captures) as $outer) {
          foreach (array_keys($captures[$outer]) as $inner) {
              $captures[$outer][$inner][2] = $this->GetPageFromOffset($captures[$outer][$inner][1]);
          }
      }

      $matches = $captures;

      return ($status);
  }
  2562.  
  2563.  
  2564. /*--------------------------------------------------------------------------------------------------------------
  2565.  
  2566. HasFormData -
  2567. Returns true if the PDF file contains form data, false otherwise.
  2568.  
  2569. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Tells whether the FormData property holds at least one entry.
   *
   * @return bool
   */
  public function HasFormData()
  {
      $form_count = count($this->FormData);

      return ($form_count > 0);
  }
  2574.  
  2575.  
  2576. /*--------------------------------------------------------------------------------------------------------------
  2577.  
  2578. GetFormCount -
  2579. Returns the number of top-level forms contained in the PDF file.
  2580.  
  2581. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Returns the number of entries in the FormData property.
   *
   * @return int
   */
  public function GetFormCount()
  {
      $form_count = count($this->FormData);

      return ($form_count);
  }
  2586.  
  2587.  
  2588. /*--------------------------------------------------------------------------------------------------------------
  2589.  
  2590. NAME
  2591. GetFormData - Returns form data, if any
  2592.  
  2593. PROTOTYPE
  2594. $object = $pdf -> GetFormData ( $template = null, $form_index = 0 ) ;
  2595.  
  2596. DESCRIPTION
  2597. Retrieves form data if present.
  2598.  
  2599. PARAMETERS
  2600. $template (string) -
  2601. An XML file describing form data using human-readable names for field values.
  2602. If not specified, the inline form definitions will be used, together with the field names
  2603. specified in the PDF file.
  2604.  
  2605. $form_index (integer) -
  2606. Form index in the PDF file. So far, I really don't know if a PDF file can have multiple forms.
  2607.  
  2608. RETURN VALUE
  2609. An object derived from the PdfToTextFormData class.
  2610.  
  2611. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Returns the form data object for the specified form, building it on first request.
   *
   * @param string|null $template    Path of an XML template file describing the form fields ;
   *                                 when empty, the definitions are built without a template.
   * @param int         $form_index  Zero-based index into $this->FormDataObjectNumbers.
   *
   * @return mixed  The value returned by GetFormDataFromPdfObject() for the form ; it is cached
   *                in $this->FormDataObjects and returned as is on later calls.
   *
   * Errors (bad index, missing template file) are reported through error() with a
   * PdfToTextFormException.
   */
  public function GetFormData($template = null, $form_index = 0)
  {
      // Already built : return the cached object
      if (isset($this->FormDataObjects[$form_index])) {
          return ($this->FormDataObjects[$form_index]);
      }

      // Valid indexes are 0 .. count - 1 ; the previous test ( > count ) let an index equal
      // to the count through, which then hit an undefined FormDataObjectNumbers entry
      if ($form_index < 0 || $form_index >= count($this->FormDataObjectNumbers)) {
          error(new PdfToTextFormException("Invalid form index #$form_index."));
      }

      $form_data = $this->FormData[$this->FormDataObjectNumbers[$form_index]];

      if ($template) {
          if (!file_exists($template)) {
              error(new PdfToTextFormException("Form data template file \"$template\" not found."));
          }

          $xml_data = file_get_contents($template);
          $definitions = new PdfToTextFormDefinitions($xml_data, $form_data['form']);
      } else {
          $definitions = new PdfToTextFormDefinitions(null, $form_data['form']);
      }

      $object = $definitions[$form_index]->GetFormDataFromPdfObject($form_data['values']);

      // NOTE(review): definitions are still appended ; no reader of FormDataDefinitions is
      // visible from here, so their indexing is left unchanged - confirm against other users.
      $this->FormDataDefinitions[] = $definitions;

      // Cache under the requested index : the lookup above is keyed by $form_index, so
      // appending with [] could return the wrong form when forms are not requested in order
      $this->FormDataObjects[$form_index] = $object;

      return ($object);
  }
  2642.  
  2643.  
  2644. /*--------------------------------------------------------------------------------------------------------------
  2645.  
  2646. NAME
  2647. MarkTextLike - Marks output text.
  2648.  
  2649. PROTOTYPE
  2650. $pdf -> MarkTextLike ( $regex, $marker_start, $marker_end ) ;
  2651.  
  2652. DESCRIPTION
  2653. Sometimes it may be convenient, when you want to extract only a portion of text, to say : "I want to
  2654. extract text between this title and this title". The MarkTextLike() method provides some support for
  2655. such a task. Imagine you have documents that have the same structure, all starting with an "Introduction"
  2656. title :
  2657.  
  2658. Introduction
  2659. ...
  2660. some text
  2661. ...
  2662. Some other title
  2663. ...
  2664.  
  2665. By calling the MarkTextLike() method such as in the example below :
  2666.  
  2667. $pdf -> MarkTextLike ( '/\bIntroduction\b/', '<M>', '</M>' ) ;
  2668.  
  2669. then you will get as output :
  2670.  
  2671. <M>Introduction</M>
  2672. ...
  2673. some text
  2674. ...
  2675. <M>Some other title</M>
  2676.  
  2677. Adding such markers in the output will allow you to easily extract the text between the chapters
  2678. "Introduction" and "Some other title", using a regular expression.
  2679.  
  2680. The font name used for the first string matched by the specified regular expression will be searched
  2681. later to add markers around all the text portions using this font.
  2682.  
  2683.  
  2684. PARAMETERS
  2685. $regex (string) -
  2686. A regular expression to match the text to be matched. Subsequent portions of text using the
  2687. same font will be surrounded by the marker start/end strings.
  2688.  
  2689. $marker_start, $marker_end (string) -
  2690. Markers to surround the string when a match is found.
  2691.  
  2692. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Queues a font-based marker definition.
   *
   * The definition is appended to the 'font' list of $this->UnprocessedMarkerList ; nothing
   * else happens in this method.
   *
   * @param string $regex         Regular expression identifying the reference text.
   * @param string $marker_start  String to put before marked text.
   * @param string $marker_end    String to put after marked text.
   */
  public function MarkTextLike($regex, $marker_start, $marker_end)
  {
      $marker = array(
          'regex' => $regex,
          'start' => $marker_start,
          'end'   => $marker_end
      );

      $this->UnprocessedMarkerList['font'][] = $marker;
  }
  2702.  
  2703.  
  2704. /*--------------------------------------------------------------------------------------------------------------
  2705.  
  2706. NAME
  2707. SetCaptures, SetCapturesFromString - Defines document parts to be captured.
  2708.  
  2709. PROTOTYPE
  2710. $pdf -> SetCaptures ( $xml_file ) ;
  2711. $pdf -> SetCapturesFromString ( $xml_data ) ;
  2712.  
  2713. DESCRIPTION
  2714. Defines document parts to be captured.
  2715. SetCaptures() takes the definitions for the areas to be captured from an XML file, while
  2716. SetCapturesFromString() takes them from a string representing xml capture definitions.
  2717.  
  2718. NOTES
  2719. - See file README.md for an explanation on the format of the XML capture definition file.
  2720. - The SetCaptures() methods must be called before the Load() method.
  2721.  
  2722. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Loads capture definitions from an XML file.
   *
   * The file contents are handed to SetCapturesFromString(). A missing file is reported
   * through error() with a PdfToTextException.
   *
   * @param string $xml_file  Path of the XML capture definition file.
   */
  public function SetCaptures($xml_file)
  {
      if (file_exists($xml_file) === false) {
          error(new PdfToTextException("File \"$xml_file\" does not exist."));
      }

      $this->SetCapturesFromString(file_get_contents($xml_file));
  }
  2733.  
  2734.  
  /**
   * Loads capture definitions from an XML string.
   *
   * Turns on the PDFOPT_BASIC_LAYOUT option, then stores a new PdfToTextCaptureDefinitions
   * object built from $xml_data in the CaptureDefinitions property.
   *
   * @param string $xml_data  XML capture definitions.
   */
  public function SetCapturesFromString($xml_data)
  {
      // Capture areas require the basic layout option
      $this->Options = $this->Options | self::PDFOPT_BASIC_LAYOUT;

      $definitions = new PdfToTextCaptureDefinitions($xml_data);
      $this->CaptureDefinitions = $definitions;
  }
  2742.  
  2743.  
  2744. /*--------------------------------------------------------------------------------------------------------------
  2745.  
  2746. NAME
  2747. GetCaptures - Returns captured data.
  2748.  
  2749. PROTOTYPE
  2750. $object = $pdf -> GetCaptures ( $full = false ) ;
  2751.  
  2752. PARAMETERS
  2753. $full (boolean) -
  2754. When true, the whole captures, together with their definitions, are returned. When false,
  2755. only a basic object containing the capture names and their values is returned.
  2756.  
  2757. DESCRIPTION
  2758. Returns the object that contains captured data.
  2759.  
  2760. RETURN VALUE
  2761. An object of type PdfToTextCaptures, or false if an error occurred.
  2762.  
  2763. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Returns the captured data, building the capture object on first request.
   *
   * @param bool $full  When true, the capture object itself is returned ; when false, the
   *                    result of its ToCaptures() method is returned.
   *
   * @return mixed  The capture object or its ToCaptures() result ; false when no capture
   *                definitions are available or no capture object could be obtained.
   */
  public function GetCaptures($full = false)
  {
      if (!$this->CaptureObject) {
          // Without capture definitions there is nothing to build ; return false, as the
          // method contract announces, instead of calling a method on a non-object
          if (!$this->CaptureDefinitions) {
              return (false);
          }

          $this->CaptureDefinitions->SetPageCount(count($this->Pages));
          $this->CaptureObject = $this->CaptureDefinitions->GetCapturedObject($this->DocumentFragments);

          if (!$this->CaptureObject) {
              return (false);
          }
      }

      return ($full ? $this->CaptureObject : $this->CaptureObject->ToCaptures());
  }
  2777.  
  2778.  
  2779. /**************************************************************************************************************
  2780. **************************************************************************************************************
  2781. **************************************************************************************************************
  2782. ****** ******
  2783. ****** ******
  2784. ****** INTERNAL METHODS ******
  2785. ****** ******
  2786. ****** ******
  2787. **************************************************************************************************************
  2788. **************************************************************************************************************
  2789. **************************************************************************************************************/
  2790.  
  2791. /*--------------------------------------------------------------------------------------------------------------
  2792.  
  2793. NAME
  2794. AddImage - Adds an image from the PDF stream to the current object.
  2795.  
  2796. PROTOTYPE
  2797. $this -> AddImage ( $object_id, $stream_data, $type, $object_data ) ;
  2798.  
  2799. DESCRIPTION
  2800. Adds an image from the PDF stream to the current object.
  2801. If the PDFOPT_GET_IMAGE_DATA flag is enabled, image data will be added to the ImageData property.
  2802. If the PDFOPT_DECODE_IMAGE_DATA flag is enabled, a jpeg resource will be created and added into the
  2803. Images array property.
  2804.  
  2805. PARAMETERS
  2806. $object_id (integer) -
  2807. Pdf object id.
  2808.  
  2809. $stream_data (string) -
  2810. Contents of the unprocessed stream data containing the image.
  2811.  
  2812. $type (integer) -
  2813. One of the PdfToText::PDF_*_ENCODING constants.
  2814.  
  2815. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Handles an image stream found in the PDF file.
   *
   * - When self::$DEBUG is set together with the PDFOPT_GET_IMAGE_DATA option, a DCT-encoded
   *   stream is stored (type 'jpeg' + raw data) in the ImageData property.
   * - When the PDFOPT_DECODE_IMAGE_DATA option is set and the MaxExtractedImages limit (if any)
   *   is not reached, the image is decoded with DecodeImage(). A decoded image is either saved
   *   to a generated filename (PDFOPT_AUTOSAVE_IMAGES) or appended to the Images property.
   *
   * @param int    $object_id    Pdf object id.
   * @param string $stream_data  Unprocessed stream data containing the image.
   * @param int    $type         One of the PDF_*_ENCODING constants.
   * @param string $object_data  Object data, passed through to DecodeImage().
   */
  protected function AddImage($object_id, $stream_data, $type, $object_data)
  {
      // Raw image data is only kept in debug mode, and only for DCT (jpeg) streams
      if (self::$DEBUG && ($this->Options & self::PDFOPT_GET_IMAGE_DATA)) {
          if ($type == self::PDF_DCT_ENCODING) {
              $this->ImageData = array('type' => 'jpeg', 'data' => $stream_data);
          }
      }

      // Decoding not requested
      if (!($this->Options & self::PDFOPT_DECODE_IMAGE_DATA)) {
          return;
      }

      // Extraction limit reached
      if ($this->MaxExtractedImages && !($this->ImageCount < $this->MaxExtractedImages)) {
          return;
      }

      $autosave = $this->Options & self::PDFOPT_AUTOSAVE_IMAGES;
      $image = $this->DecodeImage($object_id, $stream_data, $type, $object_data, $autosave);

      if ($image === false) {
          return;
      }

      $this->ImageCount++;

      if ($this->Options & self::PDFOPT_AUTOSAVE_IMAGES) {
          // Autosave : write the image to a generated filename and release it
          $output_filename = $this->__get_output_image_filename();

          $image->SaveAs($output_filename, $this->ImageAutoSaveFormat);
          unset($image);

          $this->AutoSavedImageFiles[] = $output_filename;
      } else {
          // Otherwise keep the decoded image in memory
          $this->Images[] = $image;
      }
  }
  2857.  
  2858.  
  2859. /*--------------------------------------------------------------------------------------------------------------
  2860.  
  2861. NAME
  2862. DecodeData - Decodes stream data.
  2863.  
  2864. PROTOTYPE
  2865. $data = $this -> DecodeData ( $object_id, $stream_data, $type ) ;
  2866.  
  2867. DESCRIPTION
  2868. Decodes stream data (binary data located between the "stream" and "endstream" directives) according to the
  2869. specified encoding type, given in the surrounding object parameters.
  2870.  
  2871. PARAMETERS
  2872. $object_id (integer) -
  2873. Id of the object containing the data.
  2874.  
  2875. $stream_data (string) -
  2876. Contents of the binary stream.
  2877.  
  2878. $type (integer) -
  2879. One of the PDF_*_ENCODING constants, as returned by the GetEncodingType() method.
  2880.  
  2881. RETURN VALUE
  2882. Returns the decoded stream data.
  2883.  
  2884. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Decodes stream data according to its encoding type.
   *
   * @param int    $object_id    Id of the object containing the data (used for decryption
   *                             and in warnings).
   * @param string $stream_data  Contents of the binary stream.
   * @param int    $type         One of the PDF_*_ENCODING constants.
   * @param string $object_data  Not used by this method.
   *
   * @return string|false  The decoded data ; an empty string for an encoding type not handled
   *                       here ; false when flate or Ascii85 decoding failed.
   */
  protected function DecodeData($object_id, $stream_data, $type, $object_data)
  {
      if ($type == self::PDF_FLATE_ENCODING) {
          // Objects of encrypted files may still be stored unencrypted : always try to
          // uncompress first, and decrypt only when that fails
          $inflated = @gzuncompress($stream_data);

          if ($inflated !== false) {
              return ($inflated);
          }

          if (!$this->IsEncrypted) {
              if (self::$DEBUG > 1) {
                  warning(new PdfToTextDecodingException("Invalid gzip data.", $object_id));
              }

              return ($inflated);
          }

          $decrypted = $this->EncryptionData->Decrypt($object_id, $stream_data);

          if ($decrypted === false && self::$DEBUG > 1) {
              warning(new PdfToTextDecodingException("Unable to decrypt object contents.", $object_id));
          }

          return ($decrypted);
      }

      if ($type == self::PDF_LZW_ENCODING) {
          return ($this->__decode_lzw($stream_data));
      }

      if ($type == self::PDF_ASCIIHEX_ENCODING) {
          return ($this->__decode_ascii_hex($stream_data));
      }

      if ($type == self::PDF_ASCII85_ENCODING) {
          $decoded = $this->__decode_ascii_85($stream_data);

          // The decoded data may itself be compressed even when the object flags do not
          // say so : keep the uncompressed version when gzuncompress() succeeds
          if ($decoded !== false) {
              $inflated = @gzuncompress($decoded);

              if ($inflated !== false) {
                  $decoded = $inflated;
              }
          }

          return ($decoded);
      }

      if ($type == self::PDF_TEXT_ENCODING) {
          return ($stream_data);
      }

      // Encoding type not handled here
      return ('');
  }
  2939.  
  2940.  
  // __decode_lzw -
  //    Decoder for LZW-compressed stream data (/LZWDecode). Codes are read most significant bit
  //    first, starting with 9 bits ; code 256 clears the table and code 257 marks the end of data.
  //    Returns the decoded string.
  private function __decode_lzw($data)
  {
      // The initial dictionary contains 256 entries where each index maps to its own byte value
      static $InitialDictionary = null;

      // When the next free dictionary index reaches one of these keys, codes are read using the
      // corresponding number of bits
      static $DictionaryLengths = array(
          511  => 10,
          1023 => 11,
          2047 => 12
      );

      if ($InitialDictionary === null) {
          $InitialDictionary = array_map('chr', range(0, 255));
      }

      // Decoded string to be returned
      $result = '';

      // Convert the input to a string of '0' and '1' characters
      $bit_string = '';
      $data_length = strlen($data);

      for ($i = 0; $i < $data_length; $i++) {
          $bit_string .= sprintf('%08b', ord($data[$i]));
      }

      $data_length *= 8;

      // Initialize dictionary ; entries 256 and 257 are reserved for the two markers
      $bit_length = 9;
      $dictionary_index = 258;
      $dictionary = $InitialDictionary;

      // Previously decoded code
      $previous_index = 0;

      // Current position in the bit string
      $start_index = 0;

      // Read codes until the EOD marker (257) is met. A code is read only when all of its bits
      // are available, so that trailing padding bits are not decoded as an extra code.
      while (($start_index + $bit_length <= $data_length) &&
             (($index = bindec(substr($bit_string, $start_index, $bit_length))) !== 257)) {
          // Move to next code
          $start_index += $bit_length;

          if ($index !== 256 && $previous_index !== 256) {
              if ($index < $dictionary_index) {
                  // Known code : output its string ; the new entry is the previous string
                  // followed by the first byte of the current one
                  $result .= $dictionary[$index];
                  $dictionary_value = $dictionary[$previous_index] . $dictionary[$index][0];
                  $previous_index = $index;
              } else {
                  // Code not in the dictionary yet : it designates the entry about to be created,
                  // which is the previous string followed by its own first byte
                  $dictionary_value = $dictionary[$previous_index] . $dictionary[$previous_index][0];
                  $result .= $dictionary_value;

                  // This entry becomes the previous one for the next code. Leaving the previous
                  // index unchanged here builds wrong entries for repeated data (eg, "aaaaaa").
                  $previous_index = $dictionary_index;
              }

              // Update dictionary
              $dictionary[$dictionary_index++] = $dictionary_value;

              // Change bit length whenever we reach an index limit
              if (isset($DictionaryLengths[$dictionary_index])) {
                  $bit_length = $DictionaryLengths[$dictionary_index];
              }
          } elseif ($index === 256) {
              // Clear table marker : reset dictionary and bit length
              $bit_length = 9;
              $dictionary_index = 258;
              $previous_index = 256;
              $dictionary = $InitialDictionary;
          } else {
              // First code after a clear table marker : output it, no dictionary entry is created
              $result .= $dictionary[$index];
              $previous_index = $index;
          }
      }

      // All done, return
      return ($result);
  }
  3049.  
  3050.  
  // __decode_ascii_hex -
  //    Decoder for /AsciiHexDecode streams : pairs of hexadecimal digits terminated by '>'.
  //    Whitespace between digits is ignored, and a '%' skips everything up to the end of line.
  //    When the number of digits is odd, the last digit is taken as the high-order half of a
  //    final byte. Returns the decoded string, or an empty string when an invalid character is
  //    found or when the terminating '>' is missing.
  private function __decode_ascii_hex($input)
  {
      // Characters ignored between digits. Double quotes are required here : in single quotes,
      // '\n' and the like are a backslash followed by a letter, not a control character.
      static $whitespace = "\0\t\r\f\n ";
      static $hex_digits = '0123456789abcdefABCDEF';

      $output = '';
      $high_digit = -1;      // Value of the pending high-order digit, -1 when there is none
      $in_comment = false;
      $length = strlen($input);

      for ($i = 0; $i < $length && $input[$i] !== '>'; $i++) {
          $c = $input[$i];

          // Inside a comment : skip everything up to the end of line
          if ($in_comment) {
              if ($c === "\r" || $c === "\n") {
                  $in_comment = false;
              }

              continue;
          }

          if (strpos($whitespace, $c) !== false) {
              continue;
          }

          if ($c === '%') {
              $in_comment = true;
              continue;
          }

          // Anything else must be a hexadecimal digit
          if (strpos($hex_digits, $c) === false) {
              return ('');
          }

          $code = hexdec($c);

          if ($high_digit < 0) {
              $high_digit = $code;
          } else {
              $output .= chr(($high_digit << 4) | $code);
              $high_digit = -1;
          }
      }

      // The loop ran to the end of the input without meeting the '>' end-of-data marker
      if ($i >= $length) {
          return ('');
      }

      // Odd number of digits : the last one is completed with a zero low-order digit
      if ($high_digit >= 0) {
          $output .= chr($high_digit << 4);
      }

      return ($output);
  }
  3109.  
  3110.  
  // __decode_ascii_85 -
  //    Decoder for /Ascii85Decode streams.
  //
  //    Groups of 5 characters in the range '!'..'u' encode a 32-bit big-endian value, which
  //    is output as 4 bytes. A 'z' found between two groups stands for 4 nul bytes.
  //    Whitespace characters (NUL, TAB, SPACE, CR, LF, FF) are ignored. Decoding stops at the
  //    first '~' character (start of the '~>' end-of-data marker) or at the end of the data.
  //    An optional leading '<~' marker is skipped.
  //    A final incomplete group of n characters (2 <= n <= 4) is padded with 'u' characters
  //    and yields n-1 bytes ; a final group of a single character yields nothing.
  //
  //    Returns the decoded data, or false if the input is an empty string or contains a
  //    character that is not allowed in Ascii85 data.
  private function __decode_ascii_85 ( $data )
  {
      // Ordinal value of the first character used in Ascii85 encoding
      static $first_ord = 33 ;
      // A 'z' in the input data means "sequence of 4 nuls"
      static $z_exception = "\0\0\0\0" ;
      // Characters that can be freely interspersed in the encoded data
      static $whitespace = "\0\t \r\n\f" ;

      // Ignore empty data
      if ( $data === '' )
          return ( false ) ;

      $data = ( string ) $data ;
      $data_length = strlen ( $data ) ;
      $ords = array ( ) ;
      $ord_count = 0 ;
      $result = '' ;

      // Ascii85 data may start with '<~' (possibly after some whitespace) : start past this construct if present.
      // substr() is used instead of direct indexing so that one-character input does not read past the end.
      $start = strspn ( $data, $whitespace ) ;

      if ( substr ( $data, $start, 2 ) === '<~' )
          $start += 2 ;

      // Loop through input characters
      for ( $i = $start ; $i < $data_length && $data [$i] !== '~' ; $i ++ )
      {
          $ch = $data [$i] ;

          // Most common case : current character is in the range of the Ascii85 encoding ('!'..'u')
          if ( $ch >= '!' && $ch <= 'u' )
              $ords [ $ord_count ++ ] = ord ( $ch ) - $first_ord ;
          // 'z' is replaced with a sequence of null bytes, but only between two groups
          else if ( $ch === 'z' && ! $ord_count )
          {
              $result .= $z_exception ;
              continue ;
          }
          // Whitespace is ignored
          else if ( strpos ( $whitespace, $ch ) !== false )
              continue ;
          // Other characters : corrupted data...
          else
              return ( false ) ;

          // We have collected 5 characters in base 85 : convert their 32-bits value to 4 bytes
          if ( $ord_count == 5 )
          {
              $ord_count = 0 ;

              for ( $sum = 0, $j = 0 ; $j < 5 ; $j ++ )
                  $sum = ( $sum * 85 ) + $ords [ $j ] ;

              for ( $j = 3 ; $j >= 0 ; $j -- )
                  $result .= chr ( ( $sum >> ( $j * 8 ) ) & 0xFF ) ;
          }
      }

      // Final incomplete group : pad it with the highest digit (84, ie 'u'), decode it as a
      // regular group, and keep only the first ( $ord_count - 1 ) bytes
      if ( $ord_count )
      {
          for ( $sum = 0, $j = 0 ; $j < 5 ; $j ++ )
              $sum = ( $sum * 85 ) + ( ( $j < $ord_count ) ? $ords [ $j ] : 84 ) ;

          for ( $j = 0 ; $j < $ord_count - 1 ; $j ++ )
              $result .= chr ( ( $sum >> ( ( 3 - $j ) * 8 ) ) & 0xFF ) ;
      }

      // All done, return
      return ( $result ) ;
  }
  3182.  
  3183.  
  /*--------------------------------------------------------------------------------------------------------------

      NAME
          DecodeImage - Returns decoded image contents.

      PROTOTYPE
          $image  =  $this -> DecodeImage ( $object_id, $stream_data, $type, $object_data, $autosave ) ;

      DESCRIPTION
          Builds an image object from stream data, choosing the image class from the encoding type.

      PARAMETERS
          $object_id (integer) -
              Pdf object number. Not used by this method.

          $stream_data (string) -
              Image stream data.

          $type (integer) -
              One of the PdfToText::PDF_*_ENCODING constants.

          $object_data (string) -
              Object data ; only forwarded to PdfInlinedImage::CreateInstance() for flate-encoded streams.

          $autosave (boolean) -
              Forwarded to the PdfJpegImage constructor and to PdfInlinedImage::CreateInstance().
              When autosave is selected, images will not be decoded into memory unless they have a format
              different from JPEG. This is intended to save memory.

      RETURN VALUE
          Returns an image object, or false if the image encoding type is not currently supported or if
          no image could be created from a flate-encoded stream.

   *-------------------------------------------------------------------------------------------------------------*/
  protected function DecodeImage ( $object_id, $stream_data, $type, $object_data, $autosave )
  {
      // Normal JPEG image
      if ( $type == self::PDF_DCT_ENCODING )
          return ( new PdfJpegImage ( $stream_data, $autosave ) ) ;

      // CCITT fax image
      if ( $type == self::PDF_CCITT_FAX_ENCODING )
          return ( new PdfFaxImage ( $stream_data ) ) ;

      // Inflated stream : image creation is delegated to PdfInlinedImage, which may not
      // recognize the data
      if ( $type == self::PDF_FLATE_ENCODING )
      {
          $inlined = PdfInlinedImage::CreateInstance ( $stream_data, $object_data, $autosave ) ;

          return ( ( $inlined ) ? $inlined : false ) ;
      }

      // Any other encoding is not supported
      return ( false ) ;
  }
  3241.  
  3242.  
  /*--------------------------------------------------------------------------------------------------------------

      NAME
          DecodeObjectStream - Decodes an object stream.

      PROTOTYPE
          $array  =  $this -> DecodeObjectStream ( $object_id, $object_data ) ;

      DESCRIPTION
          Decodes an object stream. An object stream is a PDF object that itself contains several objects
          which are not defined using the "x y obj ... endobj" syntax.
          The object stream data is contained within stream/endstream delimiters, and is decoded using
          the encoding type reported by GetEncodingType().
          Decoded data starts with an index made of object id/offset pairs separated by whitespace ;
          catenated object data follows ; for example :

              1167 0 1168 114 <</DA(/Helv 0 Tf 0 g )/DR<</Encoding<</PDFDocEncoding 1096 0 R>>/Font<</Helv 1094 0 R/ZaDb 1095 0 R>>>>/Fields[]>>[/ICCBased 1156 0 R]

          The above example specifies two objects :
          . Object #1167, which starts at offset 0 and ends before the second object, at offset #113 in
            the data. The contents are :
              <</DA(/Helv 0 Tf 0 g )/DR<</Encoding<</PDFDocEncoding 1096 0 R>>/Font<</Helv 1094 0 R/ZaDb 1095 0 R>>>>/Fields[]>>
          . Object #1168, which starts at offset #114 and continues until the end of the object stream.
            It contains the following data :
              [/ICCBased 1156 0 R]

          When the object dictionary provides them, the /N entry (number of objects) is used to delimit the
          index, and the /First entry (offset of the first object in the decoded data) is used as the origin
          of the object offsets. This matters when the first object is itself a number, which cannot be told
          apart from the index by looking at the decoded data only. Values that are not consistent with the
          decoded data are ignored ; in that case, the index is considered to be the longest leading run of
          integers, and object data starts at the first non-whitespace character after it.

      PARAMETERS
          $object_id (integer) -
              Pdf object number.

          $object_data (string) -
              Object data.

      RETURN VALUE
          Returns false if any error occurred (mainly for syntax reasons).
          Otherwise, returns an associative array containing the following elements :
          - object_id :
              Array of all the object ids contained in the object stream.
          - object :
              Array of corresponding object data.

          The reason for this format is that it is identical to the array returned by the preg_match() function
          used in the Load() method for finding objects in a PDF file (ie, a regex that matches "x y obj/endobj"
          constructs).

   *-------------------------------------------------------------------------------------------------------------*/
  protected function DecodeObjectStream ( $object_id, $object_data )
  {
      // Whitespace characters matched by \s in a regular expression
      static $whitespace = " \t\n\r\f\x0B" ;

      // Extract encoded data for this object ; offsets are captured to be able to isolate the object dictionary.
      // Stay prepared to find one day a sample declared as an object stream but not having data delimited by stream/endstream tags
      if ( ! preg_match ( '#[^/] stream ( (\r? \n) | \r ) (?P<stream> .*?) endstream#imsx', $object_data, $stream_match, PREG_OFFSET_CAPTURE ) )
      {
          if ( self::$DEBUG > 1 )
              error ( new PdfToTextDecodingException ( "Found object stream without gzipped data", $object_id ) ) ;

          return ( false ) ;
      }

      $dictionary = ( string ) substr ( $object_data, 0, $stream_match [0] [1] ) ;
      $stream_data = $stream_match [ 'stream' ] [0] ;
      $type = $this -> GetEncodingType ( $object_id, $object_data ) ;
      $decoded_data = ( string ) $this -> DecodeData ( $object_id, $stream_data, $type, $object_data ) ;

      if ( self::$DEBUG > 1 )
          echo "\n----------------------------------- OBJSTREAM #$object_id\n$decoded_data" ;

      // Object streams data start with a series of object id/offset pairs.
      // The leading run of digits and whitespace is located with strspn() rather than with a regular expression
      // using nested quantifiers, which could exhaust the stack on large object streams.
      $header_length = strspn ( $decoded_data, $whitespace . '0123456789' ) ;
      $tokens = array ( ) ;

      if ( $header_length && preg_match_all ( '/\d+/', substr ( $decoded_data, 0, $header_length ), $token_matches, PREG_OFFSET_CAPTURE ) )
          $tokens = $token_matches [0] ;        // Each entry is : array ( integer-as-string, offset )

      if ( ! $tokens )
      {
          if ( self::$DEBUG > 1 )
              error ( new PdfToTextDecodingException ( "Object stream does not start with integer object id/offset pairs.", $object_id ) ) ;

          return ( false ) ;
      }

      // If the dictionary tells how many objects are present, do not read more than that number of pairs :
      // extra integers belong to the first object
      if ( preg_match ( '#/N\s+(\d+)#', $dictionary, $count_match ) )
      {
          $declared_values = 2 * ( int ) $count_match [1] ;

          if ( $declared_values > 0 && $declared_values <= count ( $tokens ) )
              $tokens = array_slice ( $tokens, 0, $declared_values ) ;
      }

      // The index should contain an even number of values
      if ( count ( $tokens ) % 2 )
      {
          if ( self::$DEBUG )
              warning ( new PdfToTextDecodingException ( "Object stream should start with an even number of integer values.", $object_id ) ) ;

          array_pop ( $tokens ) ;
      }

      $objects = array ( 'object_id' => array ( ), 'object' => array ( ) ) ;

      if ( ! $tokens )
          return ( $objects ) ;

      // Locate the start of the object data : by default, the first non-whitespace character after the index...
      $last_token = end ( $tokens ) ;
      $index_end = $last_token [1] + strlen ( $last_token [0] ) ;
      $data_start = $index_end + strspn ( $decoded_data, $whitespace, $index_end ) ;

      // ... unless the dictionary gives a /First value that does not fall inside the index or past the data
      if ( preg_match ( '#/First\s+(\d+)#', $dictionary, $first_match ) )
      {
          $declared_first = ( int ) $first_match [1] ;

          if ( $declared_first >= $index_end && $declared_first <= strlen ( $decoded_data ) )
              $data_start = $declared_first ;
      }

      $data = ( string ) substr ( $decoded_data, $data_start ) ;

      // Extract every individual object
      for ( $i = 0, $count = count ( $tokens ) ; $i < $count ; $i += 2 )
      {
          $current_id = ( int ) $tokens [$i] [0] ;
          $offset = ( int ) $tokens [$i+1] [0] ;

          // If there is a "next" object, extract only a substring within the object stream contents
          if ( isset ( $tokens [ $i + 3 ] ) )
              $object_contents = substr ( $data, $offset, ( int ) $tokens [ $i + 3 ] [0] - $offset ) ;
          // Otherwise, extract everything until the end
          else
              $object_contents = substr ( $data, $offset ) ;

          $objects [ 'object_id' ] [] = $current_id ;
          $objects [ 'object' ] [] = ( string ) $object_contents ;
      }

      return ( $objects ) ;
  }
  3358.  
  3359.  
  /*--------------------------------------------------------------------------------------------------------------

      NAME
          ExtractTextData - Extracts text, header & footer information from a text object.

      PROTOTYPE
          $this -> ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer ) ;

      DESCRIPTION
          Splits the contents of a text object into regular text, page header and page footer.
          The extracted text contents are stripped from any header/footer information.
          When several header (or footer) constructs are found, only the last one is returned.

      PARAMETERS
          $object_id (integer) -
              Pdf object number. Not used by this method.

          $stream_contents (string) -
              Text-drawing instructions to be analyzed.

          $text (string) -
              Variable that will receive text contents.

          $header, $footer (string) -
              Variables that will receive header and footer information (an empty string if none was found).

   *-------------------------------------------------------------------------------------------------------------*/
  protected function ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer )
  {
      // A header or footer is normally introduced by a construct like :
      //    << /Type /Pagination ... [/Bottom] ... >>    (or [/Top])
      // followed by text-drawing instructions enclosed between BDC and EMC.
      // A first version of the regular expression used ".*?" to skip the contents of the "<< >>" construct, but
      // this captured too much data ; in the following example :
      //    <</MCID 0>> ...(several kb of drawing instructions)... << ... [/Bottom] ... >> BDC (footer instructions) EMC
      // everything was captured from the initial "<</MCID 0>>" up to the final "EMC", so regular page contents were
      // interpreted as page bottom contents.
      // Using "[^>]*?" instead works better, but will fail to recognize a header/footer declaration that contains
      // a nested construct, such as :
      //    << /Type /Pagination ... [/Bottom] ... << (some nested contents) >> ... >>
      // Let's wait for the case to happen one day...
      static $pagination_re = '#
          (?P<contents>
              << [^>]*? \[ \s* / (?P<location> (Bottom) | (Top) ) \s* \] [^>]*? >> \s*
              BDC .*? EMC
          )
          #imsx' ;

      $text = '' ;
      $footer = '' ;
      $header = '' ;

      // No header or footer : the whole contents are regular text
      if ( ! preg_match_all ( $pagination_re, $stream_contents, $found, PREG_SET_ORDER ) )
      {
          $text = $stream_contents ;
          return ;
      }

      // Dispatch each construct found to the footer ("Bottom") or to the header ("Top")
      foreach ( $found as $pagination )
      {
          if ( strcasecmp ( $pagination [ 'location' ], 'Bottom' ) === 0 )
              $footer = $pagination [ 'contents' ] ;
          else
              $header = $pagination [ 'contents' ] ;
      }

      // What remains once headers and footers have been removed is the regular text
      $text = preg_replace ( $pagination_re, '', $stream_contents ) ;
  }
  3421.  
  3422.  
  3423. /*--------------------------------------------------------------------------------------------------------------
  3424.  
  3425. NAME
  3426. ExtractText - extracts text from a pdf stream.
  3427.  
  3428. PROTOTYPE
  3429. $text = $this -> ExtractText ( $page_number, $object_id, $data, &$current_font ) ;
  3430.  
  3431. DESCRIPTION
  3432. Extracts text from decoded stream contents.
  3433.  
  3434. PARAMETERS
  3435. $page_number (integer) -
  3436. Page number that contains the text to be extracted.
  3437.  
  3438. $object_id (integer) -
  3439. Object id of this text block.
  3440.  
  3441. $data (string) -
  3442. Stream contents.
  3443.  
  3444. $current_font (integer) -
  3445. Id of the current font, which should be found in the $this->FontTable property, if anything
  3446. went ok.
  3447. This parameter is required, since text blocks may not specify a new font resource id and reuse
  3448. the one that was set before.
  3449.  
  3450. RETURN VALUE
  3451. Returns the decoded text.
  3452.  
  3453. NOTES
  3454. The PDF language can be seen as a stack-driven language ; for example, the instruction defining a text
  3455. matrix ( "Tm" ) expects 6 floating-point values from the stack :
  3456.  
  3457. 0 0 0 0 x y Tm
  3458.  
  3459. It can also specify specific operators, such as /Rx, which sets font number "x" to be the current font,
  3460. or even "<< >>" constructs that we can ignore during our process of extracting textual data.
  3461. Actually, we only want to handle a very small subset of the Adobe drawing language ; These are :
  3462. - "Tm" instructions, that specify, among others, the x and y coordinates of the next text to be output
  3463. - "/R" instructions, that specify which font is to be used for the next text output. This is useful
  3464. only if the font has an associated character map.
  3465. - "/F", same as "/R", but use a font map id instead of a direct object id.
  3466. - Text, specified either using a single notation ( "(sometext)" ) or the array notation
  3467. ( "[(...)d1(...)d2...(...)]" ), which allows for specifying inter-character spacing.
  3468. - "Tf" instructions, that specifies the font size. This is to be able to compute approximately the
  3469. number of empty lines between two successive Y coordinates in "Tm" instructions
  3470. - "TL" instructions, that define the text leading to be used by "T*"
  3471.  
  3472. This is why I chose to decompose the process of text extraction into three steps :
  3473. - The first one, the lowest-level step, is a tokenizer that extracts individual elements, such as "Tm",
  3474. "TJ", "/Rx" or "510.77". This is handled by the __next_token() method.
  3475. - The second one, __next_instruction(), collects tokens. It pushes every floating-point value onto the
  3476. stack, until an instruction is met.
  3477. - The third one, ExtractText(), processes data returned by __next_instruction(), and actually performs
  3478. the (restricted) parsing of text drawing instructions.
  3479.  
  3480. *-------------------------------------------------------------------------------------------------------------*/
  3481. protected function ExtractText ( $page_number, $object_id, $data, &$current_font )
  3482. {
  3483. $new_data = $this -> __strip_useless_instructions ( $data ) ;
  3484.  
  3485. if ( self::$DEBUG )
  3486. {
  3487. echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
  3488. echo $data ;
  3489. echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
  3490. echo $new_data ;
  3491. }
  3492.  
  3493. $data = $new_data ;
  3494.  
  3495. // Index into the specified block of text-drawing instructions
  3496. $data_index = 0 ;
  3497.  
  3498. $data_length = strlen ( $data ) ; // Data length
  3499. $result = '' ; // Resulting string
  3500.  
  3501. // Y-coordinate of the last seen "Tm" instruction
  3502. $last_goto_y = 0 ;
  3503. $last_goto_x = 0 ;
  3504.  
  3505. // Y-coordinate of the last seen "Td" or "TD" relative positioning instruction
  3506. $last_relative_goto_y = 0 ;
  3507.  
  3508. // When true, the current text should be output on the same line as the preceding one
  3509. $use_same_line = false ;
  3510.  
  3511. // Instruction preceding the current one
  3512. $last_instruction = true ;
  3513.  
  3514. // Current font size
  3515. $current_font_size = 0 ;
  3516.  
  3517. // Active template
  3518. $current_template = '' ;
  3519.  
  3520. // Various pre-computed variables
  3521. $separator_length = strlen ( $this -> Separator ) ;
  3522.  
  3523. // Current font map width, in bytes, plus a flag saying whether the current font is mapped or not
  3524. $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
  3525.  
  3526. // Extra newlines to add before the current text
  3527. $extra_newlines = 0 ;
  3528.  
  3529. // Text leading used by T*
  3530. $text_leading = 0 ;
  3531.  
  3532. // Set to true if a separator needs to be inserted
  3533. $needs_separator = false ;
  3534.  
  3535. // A flag to tell if we should "forget" the last instruction
  3536. $discard_last_instruction = false ;
  3537.  
  3538. // A flag that tells whether the Separator and BlockSeparator properties are identical
  3539. $same_separators = ( $this -> Separator == $this -> BlockSeparator ) ;
  3540.  
  3541. // Instruction count (used for handling execution timeouts)
  3542. $instruction_count = 0 ;
  3543.  
  3544. // Unprocessed markers
  3545. $unprocessed_marker_count = count ( $this -> UnprocessedMarkerList [ 'font' ] ) ;
  3546.  
  3547. // Loop through instructions
  3548. while ( ( $instruction = $this -> __next_instruction ( $page_number, $data, $data_length, $data_index, $current_template ) ) !== false )
  3549. {
  3550. $fragment = '' ;
  3551.  
  3552. $instruction_count ++ ;
  3553.  
  3554. // Timeout handling - don't test for every instruction processed
  3555. if ( ! ( $instruction_count % 100 ) )
  3556. {
  3557. // Global timeout handling
  3558. if ( $this -> Options & self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME )
  3559. {
  3560. $now = microtime ( true ) ;
  3561.  
  3562. if ( $now - self::$GlobalExecutionStartTime > self::$MaxGlobalExecutionTime )
  3563. error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ;
  3564. }
  3565.  
  3566. // Per-instance timeout handling
  3567. if ( $this -> Options & self::PDFOPT_ENFORCE_EXECUTION_TIME )
  3568. {
  3569. $now = microtime ( true ) ;
  3570.  
  3571. if ( $now - $this -> ExecutionStartTime > $this -> MaxExecutionTime )
  3572. error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ;
  3573. }
  3574. }
  3575.  
  3576. // Character position after the current instruction
  3577. $data_index = $instruction [ 'next' ] ;
  3578.  
  3579. // Process current instruction
  3580. switch ( $instruction [ 'instruction' ] )
  3581. {
  3582. // Raw text (enclosed by parentheses) or array text (enclosed within square brackets)
  3583. // is returned as a single instruction
  3584. case 'text' :
  3585. // Empty arrays of text may be encountered - ignore them
  3586. if ( ! count ( $instruction [ 'values' ] ) )
  3587. break ;
  3588.  
  3589. // Check if we have to insert a newline
  3590. if ( ! $use_same_line )
  3591. {
  3592. $fragment .= $this -> EOL ;
  3593. $needs_separator = false ;
  3594. }
  3595. // Roughly simulate spacing between lines by inserting newline characters
  3596. else if ( $extra_newlines > 0 )
  3597. {
  3598. $fragment .= str_repeat ( $this -> EOL, $extra_newlines ) ;
  3599. $extra_newlines = 0 ;
  3600. $needs_separator = false ;
  3601. }
  3602. else
  3603. $needs_separator = true ;
  3604.  
  3605. // Add a separator if necessary
  3606. if ( $needs_separator )
  3607. {
  3608. // If the Separator and BlockSeparator properties are the same (and not empty), only add a block separator if
  3609. // the current result does not end with it
  3610. if ( $same_separators )
  3611. {
  3612. if ( $this -> Separator != '' && substr ( $fragment, - $separator_length ) != $this -> BlockSeparator )
  3613. $fragment .= $this -> BlockSeparator ;
  3614. }
  3615. else
  3616. $fragment .= $this -> BlockSeparator ;
  3617. }
  3618.  
  3619. $needs_separator = true ;
  3620. $value_index = 0 ;
  3621.  
  3622. // Fonts having character maps will require some special processing
  3623. if ( $current_font_mapped )
  3624. {
  3625. // Loop through each text value
  3626. foreach ( $instruction [ 'values' ] as $text )
  3627. {
  3628. $is_hex = ( $text [0] == '<' ) ;
  3629. $length = strlen ( $text ) - 1 ;
  3630. $handled = false ;
  3631.  
  3632. // Characters are encoded within angle brackets ( "<>" ).
  3633. // Note that several characters can be specified within the same angle brackets, so we have to take
  3634. // into account the width we detected in the begincodespancerange construct
  3635. if ( $is_hex )
  3636. {
  3637. for ( $i = 1 ; $i < $length ; $i += $current_font_map_width )
  3638. {
  3639. $value = substr ( $text, $i, $current_font_map_width ) ;
  3640. $ch = hexdec ( $value ) ;
  3641.  
  3642. if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
  3643. $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
  3644. else if ( $current_font == -1 )
  3645. {
  3646. $newchar = chr ( $ch ) ;
  3647. }
  3648. else
  3649. {
  3650. $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch ) ;
  3651. $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ;
  3652. }
  3653.  
  3654. $fragment .= $newchar ;
  3655. }
  3656.  
  3657. $handled = true ;
  3658. }
  3659. // Yes ! double-byte codes can also be specified as plain text within parentheses !
  3660. // However, we have to be really careful here ; the sequence :
  3661. // (Be)
  3662. // can mean the string "Be" or the Unicode character 0x4265 ('B' = 0x42, 'e' = 0x65)
  3663. // We first look if the character map contains an entry for Unicode codepoint 0x4265 ;
  3664. // if not, then we have to consider that it is regular text to be taken one character by
  3665. // one character. In this case, we fall back to the "if ( ! $handled )" condition
  3666. else if ( $current_font_map_width == 4 )
  3667. {
  3668. $temp_result = '' ;
  3669.  
  3670. for ( $i = 1 ; $i < $length ; $i ++ )
  3671. {
  3672. // Each character in the pair may be a backslash, which escapes the next character so we must skip it
  3673. // This code needs to be reviewed ; the same code is duplicated to handle escaped characters in octal notation
  3674. if ( $text [$i] != '\\' )
  3675. $ch1 = $text [$i] ;
  3676. else
  3677. {
  3678. $i ++ ;
  3679.  
  3680. if ( $text [$i] < '0' || $text [$i] > '7' )
  3681. $ch1 = $this -> ProcessEscapedCharacter ( $text [$i] ) ;
  3682. else
  3683. {
  3684. $oct = '' ;
  3685. $digit_count = 0 ;
  3686.  
  3687. while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 )
  3688. {
  3689. $oct .= $text [$i ++] ;
  3690. $digit_count ++ ;
  3691. }
  3692.  
  3693. $ch1 = chr ( octdec ( $oct ) ) ;
  3694. $i -- ;
  3695. }
  3696. }
  3697.  
  3698. $i ++ ;
  3699.  
  3700. if ( $text [$i] != '\\' )
  3701. $ch2 = $text [$i] ;
  3702. else
  3703. {
  3704. $i ++ ;
  3705.  
  3706. if ( $text [$i] < '0' || $text [$i] > '7' )
  3707. $ch2 = $this -> ProcessEscapedCharacter ( $text [$i] ) ;
  3708. else
  3709. {
  3710. $oct = '' ;
  3711. $digit_count = 0 ;
  3712.  
  3713. while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 )
  3714. {
  3715. $oct .= $text [$i ++] ;
  3716. $digit_count ++ ;
  3717. }
  3718.  
  3719. $ch2 = chr ( octdec ( $oct ) ) ;
  3720. $i -- ;
  3721. }
  3722. }
  3723.  
  3724. // Build the 2-bytes character code
  3725. $ch = ( ord ( $ch1 ) << 8 ) | ord ( $ch2 ) ;
  3726.  
  3727. if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
  3728. $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
  3729. else
  3730. {
  3731. $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch, true ) ;
  3732. $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ;
  3733. }
  3734.  
  3735. // Yes !!! for characters encoded with two bytes, we can find the following construct :
  3736. // 0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")"
  3737. // which must be expanded as : (Car)
  3738. // We have here the escape sequences "\(" and "\)", but the backslash is encoded on two bytes
  3739. // (although the MSB is nul), while the escaped character is encoded on 1 byte. waiting
  3740. // for the next quirk to happen...
  3741. if ( $newchar == '\\' && isset ( $text [ $i + 2 ] ) )
  3742. {
  3743. $newchar = $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] ) ;
  3744. $i ++ ; // this time we processed 3 bytes, not 2
  3745. }
  3746.  
  3747. $temp_result .= $newchar ;
  3748. }
  3749.  
  3750. // Happens only if we were unable to translate a character using the current character map
  3751. $fragment .= $temp_result ;
  3752. $handled = true ;
  3753. }
  3754.  
  3755. // Character strings within parentheses.
  3756. // For every text value, use the character map table for substitutions
  3757. if ( ! $handled )
  3758. {
  3759. for ( $i = 1 ; $i < $length ; $i ++ )
  3760. {
  3761. $ch = $text [$i] ;
  3762.  
  3763. // Set to true to optimize calls to MapCharacters
  3764. // Currently does not work with pobox@dizy.sk/infoma.pdf (a few characters differ)
  3765. $use_map_buffer = false ;
  3766.  
  3767. // ... but don't forget to handle escape sequences "\n" and "\r" for characters
  3768. // 10 and 13
  3769. if ( $ch == '\\' )
  3770. {
  3771. $ch = $text [++$i] ;
  3772.  
  3773. // Escaped character
  3774. if ( $ch < '0' || $ch > '7' )
  3775. $ch = $this -> ProcessEscapedCharacter ( $ch ) ;
  3776. // However, an octal form can also be specified ; in this case we have to take into account
  3777. // the character width for the current font (if the character width is 4 hex digits, then we
  3778. // will encounter constructs such as "\000\077").
  3779. // The method used here is dirty : we build a regex to match octal character representations on a substring
  3780. // of the text
  3781. else
  3782. {
  3783. $width = $current_font_map_width / 2 ; // Convert to byte count
  3784. $subtext = substr ( $text, $i - 1 ) ;
  3785. $regex = "#^ (\\\\ [0-7]{3}){1,$width} #imsx" ;
  3786.  
  3787. $status = preg_match ( $regex, $subtext, $octal_matches ) ;
  3788.  
  3789. if ( $status )
  3790. {
  3791. $octal_values = explode ( '\\', substr ( $octal_matches [0], 1 ) ) ;
  3792. $ord = 0 ;
  3793.  
  3794. foreach ( $octal_values as $octal_value )
  3795. $ord = ( $ord << 8 ) + octdec ( $octal_value ) ;
  3796.  
  3797. $ch = chr ( $ord ) ;
  3798. $i += strlen ( $octal_matches [0] ) - 2 ;
  3799. }
  3800. }
  3801.  
  3802. $use_map_buffer = false ;
  3803. }
  3804.  
  3805. // Add substituted character to the output result
  3806. $ord = ord ( $ch ) ;
  3807.  
  3808. if ( ! $use_map_buffer )
  3809. $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
  3810. else
  3811. {
  3812. if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
  3813. $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
  3814. else
  3815. {
  3816. $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
  3817. $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ;
  3818. }
  3819. }
  3820.  
  3821. $fragment .= $newchar ;
  3822. }
  3823. }
  3824.  
  3825. // Handle offsets between blocks of characters
  3826. if ( isset ( $instruction [ 'offsets' ] [ $value_index ] ) &&
  3827. - ( $instruction [ 'offsets' ] [ $value_index ] ) > $this -> MinSpaceWidth )
  3828. $fragment .= $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ;
  3829.  
  3830. $value_index ++ ;
  3831. }
  3832. }
  3833. // For fonts having no associated character map, we simply encode the string in UTF8
  3834. // after the C-like escape sequences have been processed
  3835. // Note that <xxxx> constructs can be encountered here, so we have to process them as well
  3836. else
  3837. {
  3838. foreach ( $instruction [ 'values' ] as $text )
  3839. {
  3840. $is_hex = ( $text [0] == '<' ) ;
  3841. $length = strlen ( $text ) - 1 ;
  3842.  
  3843. // Some text within parentheses may have a backslash followed by a newline, to indicate some continuation line.
  3844. // Example :
  3845. // (this is a sentence \
  3846. // continued on the next line)
  3847. // Funny isn't it ? so remove such constructs because we don't care
  3848. $text = str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text ) ;
  3849.  
  3850. // Characters are encoded within angle brackets ( "<>" )
  3851. if ( $is_hex )
  3852. {
  3853. for ( $i = 1 ; $i < $length ; $i += 2 )
  3854. {
  3855. $ch = hexdec ( substr ( $text, $i, 2 ) ) ;
  3856.  
  3857. $fragment .= $this -> CodePointToUtf8 ( $ch ) ;
  3858. }
  3859. }
  3860. // Characters are plain text
  3861. else
  3862. {
  3863. $text = self::Unescape ( $text ) ;
  3864.  
  3865. for ( $i = 1, $length = strlen ( $text ) - 1 ; $i < $length ; $i ++ )
  3866. {
  3867. $ch = $text [$i] ;
  3868. $ord = ord ( $ch ) ;
  3869.  
  3870. if ( $ord < 127 )
  3871. $newchar = $ch ;
  3872. else
  3873. {
  3874. if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
  3875. $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
  3876. else
  3877. {
  3878. $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
  3879. $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ;
  3880. }
  3881. }
  3882.  
  3883. $fragment .= $newchar ;
  3884. }
  3885. }
  3886.  
  3887. // Handle offsets between blocks of characters
  3888. if ( isset ( $instruction [ 'offsets' ] [ $value_index ] ) &&
  3889. abs ( $instruction [ 'offsets' ] [ $value_index ] ) > $this -> MinSpaceWidth )
  3890. $fragment .= $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ;
  3891.  
  3892. $value_index ++ ;
  3893. }
  3894. }
  3895.  
  3896. // Process the markers which do not have an associated font yet - this will be done by matching
  3897. // the current text fragment against one of the regular expressions defined.
  3898. // If a match occurs, then all the subsequent text fragment using the same font will be put markers
  3899. for ( $j = 0 ; $j < $unprocessed_marker_count ; $j ++ )
  3900. {
  3901. $marker = $this -> UnprocessedMarkerList [ 'font' ] [$j] ;
  3902.  
  3903. if ( preg_match ( $marker [ 'regex' ], trim ( $fragment ) ) )
  3904. {
  3905. $this -> TextWithFontMarkers [ $current_font ] = array
  3906. (
  3907. 'font' => $current_font,
  3908. 'height' => $current_font_size,
  3909. 'regex' => $marker [ 'regex' ],
  3910. 'start' => $marker [ 'start' ],
  3911. 'end' => $marker [ 'end' ]
  3912. ) ;
  3913.  
  3914. $unprocessed_marker_count -- ;
  3915. unset ( $this -> UnprocessedMarkerList [ 'font' ] [$j] ) ;
  3916.  
  3917. break ;
  3918. }
  3919. }
  3920.  
  3921. // Check if we need to add markers around this text fragment
  3922. if ( isset ( $this -> TextWithFontMarkers [ $current_font ] ) &&
  3923. $this -> TextWithFontMarkers [ $current_font ] [ 'height' ] == $current_font_size )
  3924. {
  3925. $fragment = $this -> TextWithFontMarkers [ $current_font ] [ 'start' ] .
  3926. $fragment .
  3927. $this -> TextWithFontMarkers [ $current_font ] [ 'end' ] ;
  3928. }
  3929.  
  3930. $result .= $fragment ;
  3931.  
  3932. break ;
  3933.  
  3934. // An "nl" instruction means TJ, Tj, T* or "'"
  3935. case 'nl' :
  3936. if ( ! $instruction [ 'conditional' ] )
  3937. {
  3938. if ( $instruction [ 'leading' ] && $text_leading && $current_font_size )
  3939. {
  3940. $count = ( integer ) ( ( $text_leading - $current_font_size ) / $current_font_size ) ;
  3941.  
  3942. if ( ! $count )
  3943. $count = 1 ;
  3944. }
  3945. else
  3946. $count = 1 ;
  3947.  
  3948. $extra = str_repeat ( PHP_EOL, $count ) ;
  3949. $result .= $extra ;
  3950. $needs_separator = false ;
  3951. $last_goto_y -= ( $count * $text_leading ) ; // Approximation on y-coord change
  3952. $last_relative_goto_y = 0 ;
  3953. }
  3954.  
  3955. break ;
  3956.  
  3957. // "Tm", "Td" or "TD" : Output text on the same line, if the "y" coordinates are equal
  3958. case 'goto' :
  3959. // Some text is positioned using 'Tm' instructions ; however they can be immediatley followed by 'Td' instructions
  3960. // which give a relative positioning ; so consider that the last instruction wins
  3961. if ( $instruction [ 'relative' ] )
  3962. {
  3963. // Try to put a separator if the x coordinate is non-zero
  3964. //if ( $instruction [ 'x' ] - $last_goto_x >= $current_font_size )
  3965. // $result .= $this -> Separator ;
  3966.  
  3967. $discard_last_instruction = true ;
  3968. $extra_newlines = 0 ;
  3969. $use_same_line = ( ( $last_relative_goto_y - abs ( $instruction [ 'y' ] ) ) <= $current_font_size ) ;
  3970. $last_relative_goto_y = abs ( $instruction [ 'y' ] ) ;
  3971. $last_goto_x = $instruction [ 'x' ] ;
  3972.  
  3973. if ( - $instruction [ 'y' ] > $current_font_size )
  3974. {
  3975. $use_same_line = false ;
  3976.  
  3977. if ( $last_relative_goto_y )
  3978. $extra_newlines = ( integer ) ( $current_font_size / $last_relative_goto_y ) ;
  3979. else
  3980. $extra_newlines = 0 ;
  3981. }
  3982. else if ( ! $instruction [ 'y' ] )
  3983. {
  3984. $use_same_line = true ;
  3985. $extra_newlines = 0 ;
  3986. }
  3987.  
  3988. break ;
  3989. }
  3990. else
  3991. $last_relative_goto_y = 0 ;
  3992.  
  3993. $y = $last_goto_y + $last_relative_goto_y ;
  3994.  
  3995. if ( $instruction [ 'y' ] == $y || abs ( $instruction [ 'y' ] - $y ) < $current_font_size )
  3996. {
  3997. $use_same_line = true ;
  3998. $extra_newlines = 0 ;
  3999. }
  4000. else
  4001. {
  4002. // Compute the number of newlines we have to insert between the current and the next lines
  4003. if ( $current_font_size )
  4004. $extra_newlines = ( integer ) ( ( $y - $instruction [ 'y' ] - $current_font_size ) / $current_font_size ) ;
  4005.  
  4006. $use_same_line = ( $last_goto_y == 0 ) ;
  4007. }
  4008.  
  4009. $last_goto_y = $instruction [ 'y' ] ;
  4010. break ;
  4011.  
  4012. // Set font size
  4013. case 'fontsize' :
  4014. $current_font_size = $instruction [ 'size' ] ;
  4015. break ;
  4016.  
  4017. // "/Rx" : sets the current font
  4018. case 'resource' :
  4019. $current_font = $instruction [ 'resource' ] ;
  4020.  
  4021. $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
  4022. break ;
  4023.  
  4024. // "/TPLx" : references a template, which can contain additional font aliases
  4025. case 'template' :
  4026. if ( $this -> PageMap -> IsValidXObjectName ( $instruction [ 'token' ] ) )
  4027. $current_template = $instruction [ 'token' ] ;
  4028.  
  4029. break ;
  4030.  
  4031. // 'TL' : text leading to be used for the next "T*" in the flow
  4032. case 'leading' :
  4033. if ( ! ( $this -> Options & self::PDFOPT_IGNORE_TEXT_LEADING ) )
  4034. $text_leading = $instruction [ 'size' ] ;
  4035.  
  4036. break ;
  4037.  
  4038.  
  4039. // 'ET' : we have to reset a few things here
  4040. case 'ET' :
  4041. $current_font = -1 ;
  4042. $current_font_map_width = 2 ;
  4043. break ;
  4044. }
  4045.  
  4046. // Remember last instruction - this will help us into determining whether we should put the next text
  4047. // on the current or following line
  4048. if ( ! $discard_last_instruction )
  4049. $last_instruction = $instruction ;
  4050.  
  4051. $discard_last_instruction = false ;
  4052. }
  4053.  
  4054. return ( $this -> __rtl_process ( $result ) ) ;
  4055. }
  4056.  
  4057.  
  4058.  
  4059. // __next_instruction -
  4060. // Retrieves the next instruction from the drawing text block.
  //
  // Tokens are read with __next_token(), starting at offset $index. Numeric operands are accumulated
  // on a local stack until an operator of interest is met ; the corresponding instruction is then
  // returned as an associative array containing at least the following entries :
  //     'instruction' : one of 'goto', 'nl', 'fontsize', 'leading', 'text', 'resource', 'template',
  //                     'BT' or 'ET'
  //     'next'        : offset in $data of the first character after the token that produced the
  //                     instruction
  //     'token'       : the source token (or the template name, for 'template' instructions)
  // plus instruction-specific entries : 'x', 'y' and 'relative' for 'goto' ; 'size' for 'fontsize' and
  // 'leading' ; 'values' (and 'offsets' for arrays) for 'text' ; 'conditional' and 'leading' for 'nl' ;
  // 'resource' for 'resource'.
  //
  // Parameters :
  //     $page_number, $current_template : used to resolve a font specifier into a font id
  //     $data, $data_length             : instruction stream and its length
  //     $index                          : offset in $data where scanning starts
  //
  // Returns false when the end of the instruction stream has been reached.
  private function __next_instruction ( $page_number, $data, $data_length, $index, $current_template )
  {
      // Pending instruction left over by the previous call (see the "(" and "<" cases below).
      // Note that, being a static variable, it is returned whatever the arguments of the current call are.
      static $last_instruction = false ;

      // Constructs such as :
      //     (some text) '
      // generate two instructions : a newline, then the text itself. The newline has been returned by the
      // previous call and the text instruction was put aside ; this is the time to return it.
      if ( $last_instruction )
      {
          $result = $last_instruction ;
          $last_instruction = false ;

          return ( $result ) ;
      }

      // Whether we should compute enhanced statistics
      $enhanced_statistics = $this -> EnhancedStatistics ;

      // Holds the numeric operands encountered since the last operator
      $number_stack = array ( ) ;

      // Loop through the stream of tokens
      while ( ( $part = $this -> __next_token ( $page_number, $data, $data_length, $index ) ) !== false )
      {
          $token = $part [0] ;
          $next_index = $part [1] ;

          // Floating-point number : push it onto the stack
          if ( ( $token [0] >= '0' && $token [0] <= '9' ) || $token [0] == '-' || $token [0] == '+' || $token [0] == '.' )
          {
              $number_stack [] = $token ;
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;
          }
          // 'Tm' instruction : return a "goto" instruction with the x and y coordinates (5th and 6th operands).
          // Missing operands (malformed stream) are considered to be zero instead of reading an undefined offset.
          else if ( $token == 'Tm' )
          {
              $x = isset ( $number_stack [4] ) ? $number_stack [4] : 0 ;
              $y = isset ( $number_stack [5] ) ? $number_stack [5] : 0 ;

              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ;

              return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => false, 'token' => $token ) ) ;
          }
          // 'Td' or 'TD' instructions : return a relative goto instruction with the x and y coordinates (1st and 2nd operands)
          else if ( $token == 'Td' || $token == 'TD' )
          {
              $x = isset ( $number_stack [0] ) ? $number_stack [0] : 0 ;
              $y = isset ( $number_stack [1] ) ? $number_stack [1] : 0 ;

              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ $token ] ++ ;

              return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => true, 'token' => $token ) ) ;
          }
          // Output text "'" instruction, with conditional newline
          else if ( $token [0] == "'" )
          {
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ;

              return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ;
          }
          // Same as above, for the 'TJ' and 'Tj' operators
          else if ( $token == 'TJ' || $token == 'Tj' )
          {
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ $token ] ++ ;

              return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ;
          }
          // Set font size
          else if ( $token == 'Tf' )
          {
              $size = isset ( $number_stack [0] ) ? $number_stack [0] : 0 ;

              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ;

              return ( array ( 'instruction' => 'fontsize', 'next' => $next_index, 'size' => $size, 'token' => $token ) ) ;
          }
          // Text leading (spacing used by T*)
          else if ( $token == 'TL' )
          {
              $size = isset ( $number_stack [0] ) ? $number_stack [0] : 0 ;

              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ;

              return ( array ( 'instruction' => 'leading', 'next' => $next_index, 'size' => $size, 'token' => $token ) ) ;
          }
          // Position to next line. The 'token' entry is supplied like for every other instruction.
          else if ( $token == 'T*' )
          {
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ;

              return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => true, 'token' => $token ) ) ;
          }
          // Draw object ("Do"). To prevent different text shapes to appear on the same line, we return a "newline" instruction
          // here. Note that the shape position is not taken into account here, and shapes will be processed in the order they
          // appear in the pdf file (which is likely to be different from their position on a graphic screen).
          else if ( $token == 'Do' )
          {
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;

              return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => false, 'token' => $token ) ) ;
          }
          // Raw text output
          else if ( $token [0] == '(' )
          {
              // Look ahead one token to see whether the text is followed by the "'" operator
              $next_part = $this -> __next_token ( $page_number, $data, $data_length, $next_index ) ;
              $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ;
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '(' ] ++ ;

              // $next_part is false when the text was the last token of the stream
              if ( $next_part !== false && $next_part [0] == "'" )
              {
                  // Return the newline first ; the text itself will be returned by the next call
                  $last_instruction = $instruction ;

                  return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ;
              }

              return ( $instruction ) ;
          }
          // Hex digits within angle brackets
          else if ( $token [0] == '<' )
          {
              $ch = $token [1] ;
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '<' ] ++ ;

              // Only tokens whose first character after the "<" is alphanumeric are considered as text ; others
              // (such as "<< ... >>" constructs) are silently skipped
              if ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM )
              {
                  $next_part = $this -> __next_token ( $page_number, $data, $data_length, $next_index ) ;
                  $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ;

                  // Same processing as for text between parentheses
                  if ( $next_part !== false && $next_part [0] == "'" )
                  {
                      $last_instruction = $instruction ;

                      return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ;
                  }

                  return ( $instruction ) ;
              }
          }
          // Text specified as an array of individual raw text elements, and individual interspaces between characters
          else if ( $token [0] == '[' )
          {
              $values = $this -> __extract_chars_from_array ( $token ) ;
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '[' ] ++ ;

              return ( array ( 'instruction' => 'text', 'next' => $next_index, 'values' => $values [0], 'offsets' => $values [1], 'token' => $token ) ) ;
          }
          // Token starts with a slash : maybe a font specification
          else if ( preg_match ( '#^ ( ' . self::$FontSpecifiers . ' ) #ix', $token ) )
          {
              $key = "$page_number:$current_template:$token" ;
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;

              // Font ids already resolved for this page/template/specifier are kept in a cache
              if ( isset ( $this -> MapIdBuffer [ $key ] ) )
                  $id = $this -> MapIdBuffer [ $key ] ;
              else
              {
                  $id = $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $token ) ;

                  $this -> MapIdBuffer [ $key ] = $id ;
              }

              return ( array ( 'instruction' => 'resource', 'next' => $next_index, 'resource' => $id, 'token' => $token ) ) ;
          }
          // Template reference, such as /TPL1. Each reference has initially been replaced by !PDFTOTEXT_TEMPLATE_TPLx during substitution
          // by ProcessTemplateReferences(), because templates not only specify text to be replaced, but also font aliases
          // -and this is the place where we catch font aliases in this case
          else if ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P<template> \w+) /ix', $token, $match ) )
          {
              $current_template = '/' . $match [ 'template' ] ;
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'template' ] ++ ;

              return ( array ( 'instruction' => 'template', 'next' => $next_index, 'token' => $current_template ) ) ;
          }
          // 'cm' is only counted for statistics, but it consumes its operands like any other operator : if they were
          // left on the stack, a following Tm/Td/TD/Tf/TL would pick them up instead of its own operands
          else if ( $token === 'cm' )
          {
              $number_stack = array ( ) ;
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'cm' ] ++ ;
          }
          else if ( $token === 'BT' )
          {
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'BT' ] ++ ;

              return ( array ( 'instruction' => 'BT', 'next' => $next_index, 'token' => $token ) ) ;
          }
          else if ( $token == 'ET' )      // Nothing special to count here
          {
              return ( array ( 'instruction' => 'ET', 'next' => $next_index, 'token' => $token ) ) ;
          }
          // Other instructions : we're not that much interested in them, so clear the number stack and consider
          // that the current parameters, floating-point values, have been processed
          else
          {
              $number_stack = array ( ) ;
              $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
          }

          // No instruction returned for this token : continue with the next one
          $index = $next_index ;
      }

      // End of input
      return ( false ) ;
  }
  4259.  
  4260.  
  4261. // __next_token :
  4262. // Retrieves the next token from the drawing instructions stream.
  //
  // A token is one of the following :
  //     - An array in the "[...]" notation, returned as a whole (line breaks are removed from it)
  //     - A string between parentheses, returned with its delimiters and its escape sequences left as is
  //     - A "<< ... >>" construct, or a string of hex digits between angle brackets (embedded spaces are removed)
  //     - The "'" operator
  //     - A number, a keyword or, when nothing else matches, a single character
  //
  // Parameters :
  //     $page_number        : not used by this method
  //     $data, $data_length : instruction stream and its length
  //     $index              : offset in $data where scanning starts ; leading spaces, tabs and line breaks are skipped
  //
  // Returns a two-elements array, holding the token and the offset of the character that follows it, or
  // false if the end of input has been reached or if a construct starting with "<" is not terminated.
  private function __next_token ( $page_number, $data, $data_length, $index )
  {
      // Skip spaces
      $count = 0 ;

      while ( $index < $data_length && ( $data [ $index ] == ' ' || $data [ $index ] == "\t" || $data [ $index ] == "\r" || $data [ $index ] == "\n" ) )
      {
          $index ++ ;
          $count ++ ;
      }

      $enhanced_statistics = $this -> EnhancedStatistics ;
      $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'space' ] += $count ;

      // End of input
      if ( $index >= $data_length )
          return ( false ) ;

      // The current character will tell us what to do
      $ch = $data [ $index ] ;

      switch ( $ch )
      {
          // Opening square bracket : we have to find the closing one, taking care of escape sequences
          // that can also specify a square bracket, such as "\]"
          case "[" :
              $pos = $index + 1 ;
              $parent = 0 ;           // Nesting level of parentheses
              $result = $ch ;

              while ( $pos < $data_length )
              {
                  $nch = $data [ $pos ++ ] ;

                  switch ( $nch )
                  {
                      case '(' :
                          $parent ++ ;
                          $result .= $nch ;
                          break ;

                      case ')' :
                          $parent -- ;
                          $result .= $nch ;
                          break ;

                      // Escape sequence : copy the backslash together with the character that follows, if any
                      // (the stream may be truncated right after the backslash)
                      case '\\' :
                          $result .= $nch ;

                          if ( $pos < $data_length )
                              $result .= $data [ $pos ++ ] ;

                          break ;

                      // A closing bracket ends the array only if it is not located inside a parenthesized string
                      case ']' :
                          $result .= $nch ;

                          if ( ! $parent )
                              break 2 ;

                          break ;

                      // Line breaks are not kept
                      case "\n" :
                      case "\r" :
                          break ;

                      // Everything else, including the angle brackets of hex strings, is copied as is
                      default :
                          $result .= $nch ;
                  }
              }

              return ( array ( $result, $pos ) ) ;

          // Parenthesis : Again, we have to find the closing parenthesis, taking care of escape sequences
          // such as "\)"
          case "(" :
              $pos = $index + 1 ;
              $result = $ch ;

              while ( $pos < $data_length )
              {
                  $nch = $data [ $pos ++ ] ;

                  if ( $nch == '\\' )
                  {
                      $result .= $nch ;

                      // Backslash at the very end of the stream : there is nothing more to copy
                      if ( $pos >= $data_length )
                          break ;

                      $after = $data [ $pos ] ;

                      // Character references specified as \xyz, where "xyz" are octal digits
                      if ( $after >= '0' && $after <= '7' )
                      {
                          while ( $pos < $data_length && $data [ $pos ] >= '0' && $data [ $pos ] <= '7' )
                              $result .= $data [ $pos ++ ] ;
                      }
                      // Regular character escapes
                      else
                          $result .= $data [ $pos ++ ] ;
                  }
                  else if ( $nch == ')' )
                  {
                      $result .= ')' ;
                      break ;
                  }
                  else
                      $result .= $nch ;
              }

              return ( array ( $result, $pos ) ) ;

          // A construction of the form : "<< something >>", or a string of hex digits
          case '<' :
              if ( ! isset ( $data [ $index + 1 ] ) )
                  return ( false ) ;

              if ( $data [ $index + 1 ] == '<' )
              {
                  $pos = strpos ( $data, '>>', $index + 2 ) ;

                  if ( $pos === false )
                      return ( false ) ;

                  return ( array ( substr ( $data, $index, $pos - $index + 2 ), $pos + 2 ) ) ;
              }
              else
              {
                  // Start searching right after the opening bracket, so that the empty string "<>" is
                  // correctly recognized instead of running up to the next ">" found in the stream
                  $pos = strpos ( $data, '>', $index + 1 ) ;

                  if ( $pos === false )
                      return ( false ) ;

                  // There can be spaces and newlines inside a series of hex digits, so remove them...
                  $result = preg_replace ( '/\s+/', '', substr ( $data, $index, $pos - $index + 1 ) ) ;

                  return ( array ( $result, $pos + 1 ) ) ;
              }

          // Tick character : consider it as a keyword, in the same way as the "TJ" or "Tj" keywords
          case "'" :
              return ( array ( "'", $index + 1 ) ) ;

          // Other cases : this may be either a floating-point number or a keyword
          default :
              $index ++ ;
              $value = $ch ;

              if ( isset ( $data [ $index ] ) )
              {
                  // Number : collect the digits and decimal points that follow the first character
                  if ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_DIGIT ) ||
                          $ch == '-' || $ch == '+' || $ch == '.' )
                  {
                      while ( $index < $data_length &&
                              ( ( self::$CharacterClasses [ $data [ $index ] ] & self::CTYPE_DIGIT ) ||
                                  $data [ $index ] == '.' ) )
                          $value .= $data [ $index ++ ] ;
                  }
                  // Keyword, name (starting with a slash) or template reference (starting with an exclamation point)
                  else if ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALPHA ) ||
                          $ch == '/' || $ch == '!' )
                  {
                      $ch = $data [ $index ] ;

                      while ( $index < $data_length &&
                              ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM ) ||
                                  $ch == '*' || $ch == '-' || $ch == '_' || $ch == '.' || $ch == '+' ) )
                      {
                          $value .= $ch ;
                          $index ++ ;

                          if ( isset ( $data [ $index ] ) )
                              $ch = $data [ $index ] ;
                      }
                  }
              }

              // Any other character is returned alone
              return ( array ( $value, $index ) ) ;
      }
  }
  4455.  
  4456.  
  4457. /*--------------------------------------------------------------------------------------------------------------
  4458.  
  4459. NAME
  4460. ExtractTextWithLayout - Extracts text, trying to render the page layout.
  4461.  
  4462. $text = $this -> ExtractTextWithLayout ( &$page_fragments, $page_number, $object_id, $data, &$current_font ) ;
  4463.  
  4464. DESCRIPTION
  4465. Extracts text from decoded stream contents, trying to render the layout.
  4466.  
  4467. PARAMETERS
  4468. $page_number (integer) -
  4469. Page number that contains the text to be extracted.
  4470.  
  4471. $object_id (integer) -
  4472. Object id of this text block.
  4473.  
  4474. $data (string) -
  4475. Stream contents.
  4476.  
  4477. $current_font (integer) -
  4478. Id of the current font, which should be found in the $this->FontTable property, if anything
  4479. went ok.
  4480. This parameter is required, since text blocks may not specify a new font resource id and reuse
  4481. the one that was set before.
  4482.  
  4483. RETURN VALUE
  4484. Returns the decoded text.
  4485.  
  4486. *-------------------------------------------------------------------------------------------------------------*/
  4487. protected function ExtractTextWithLayout ( &$page_fragments, $page_number, $object_id, $data, &$current_font )
  4488. {
  4489. // Characters that can start a numeric operand
  4490. static $numeric_starts = array
  4491. (
  4492. '+' => true, '-' => true, '.' => true, '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
  4493. '5' => true, '6' => true, '7' => true, '8' => true, '9' => true
  4494. ) ;
  4495. // Initial (default) transformation matrix. To reflect the PDF specifications, we will keep it as a 6 elements array :
  4496. // [ sx tx ty sy x y ]
  4497. // (although tx and ty are not useful here, since they affect the graphic orientation of the text)
  4498. // sx and sy are scaling parameters, actually a multiplier for the x and y parameters. We only keep
  4499. static $IdentityMatrix = array ( 1, 0, 0, 1, 0, 0 ) ;
  4500.  
  4501. // Remove useless instructions
  4502. $new_data = $this -> __strip_useless_instructions ( $data ) ;
  4503.  
  4504. if ( self::$DEBUG )
  4505. {
  4506. echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
  4507. echo $data ;
  4508. echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
  4509. echo $new_data ;
  4510. }
  4511.  
  4512. $data = $new_data ;
  4513. $data_length = strlen ( $data ) ; // Data length
  4514.  
  4515. $page_fragment_count = count ( $page_fragments ) ;
  4516.  
  4517. // Index into the specified block of text-drawing instructions
  4518. $data_index = 0 ;
  4519.  
  4520. // Text matrices
  4521. $CTM =
  4522. $Tm = $IdentityMatrix ;
  4523.  
  4524. // Nesting level of BT..ET instructions (Begin text/End text) - they are not nestable but be prepared to meet buggy PDFs
  4525. $BT_nesting_level = 0 ;
  4526.  
  4527. // Current font data
  4528. $current_font_height = 0 ;
  4529.  
  4530. // Current font map width, in bytes, plus a flag saying whether the current font is mapped or not
  4531. $current_template = '' ;
  4532. $current_font_name = '' ;
  4533. $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
  4534.  
  4535. // Operand stack
  4536. $operand_stack = array ( ) ;
  4537.  
  4538. // Number of tokens processed so far
  4539. $token_count = 0 ;
  4540.  
  4541. // Page attributes
  4542. $page_attributes = $this -> PageMap -> PageAttributes [ $page_number ] ;
  4543.  
  4544. // Graphics context stack - well, we only store here the current transformation matrix
  4545. $graphic_stack = array ( ) ;
  4546. $graphic_stack_size = 0 ;
  4547.  
  4548. // Global/local execution time measurements
  4549. $tokens_between_timechecks = 1000 ;
  4550. $enforce_global_execution_time = $this -> Options & self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME ;
  4551. $enforce_local_execution_time = $this -> Options & self::PDFOPT_ENFORCE_EXECUTION_TIME ;
  4552. $enforce_execution_time = $enforce_global_execution_time | $enforce_local_execution_time ;
  4553.  
  4554. // Whether we should compute enhanced statistics
  4555. $enhanced_statistics = $this -> EnhancedStatistics ;
  4556.  
  4557. // Whether we should show debug coordinates
  4558. $show_debug_coordinates = ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES ) ;
  4559.  
  4560. // Text leading value set by the TL instruction
  4561. $text_leading = 0.0 ;
  4562.  
  4563. // Loop through the stream of tokens
  4564. while ( $this -> __next_token_ex ( $page_number, $data, $data_length, $data_index, $token, $next_index ) !== false )
  4565. {
  4566. $token_start = $token [0] ;
  4567. $token_count ++ ;
  4568. $length = $next_index - $data_index - 1 ;
  4569.  
  4570. // Check if we need to enforce execution time checking, to prevent PHP from terminating our script without any hope
  4571. // of catching the error
  4572. if ( $enforce_execution_time && ! ( $token_count % $tokens_between_timechecks ) )
  4573. {
  4574. if ( $enforce_global_execution_time )
  4575. {
  4576. $now = microtime ( true ) ;
  4577.  
  4578. if ( $now - self::$GlobalExecutionStartTime > self::$MaxGlobalExecutionTime )
  4579. error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ;
  4580. }
  4581.  
  4582. // Per-instance timeout handling
  4583. if ( $enforce_local_execution_time )
  4584. {
  4585. $now = microtime ( true ) ;
  4586.  
  4587. if ( $now - $this -> ExecutionStartTime > $this -> MaxExecutionTime )
  4588. error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ;
  4589. }
  4590. }
  4591.  
  4592. /****************************************************************************************************************
  4593.  
  4594. The order of the testings is important for maximum performance : put the most common cases first.
  4595. A study on over 1000 PDF files has shown the following :
  4596.  
  4597. - Instruction operands appear 24.5 million times
  4598. - Tx instructions (including Tf, Tm, ', ", etc.) : 24M
  4599. - (), <> and [] constructs for drawing text : 17M
  4600. - Other : peanuts...
  4601. - Ignored instructions : 0.5M (these are the instructions without interest for text extraction and that
  4602. could not be removed by the __strip_useless_instructions() method).
  4603.  
  4604. Of course, white spaces appear more than 100M times between instructions. However, it gets hard to remove
  4605. most of them without compromising the result of __strip_useless_instructions.
  4606.  
  4607. ***************************************************************************************************************/
  4608. // Numeric or flag for an instruction
  4609. if ( $token_start == '/' || isset ( $numeric_starts [ $token_start ] ) )
  4610. {
  4611. $operand_stack [] = $token ;
  4612.  
  4613. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;
  4614. }
  4615. // A 2-characters "Tx" or a 1-character quote/doublequote instruction
  4616. else if ( ( $length === 2 && $token_start === 'T' ) || ( $length === 1 && ( $token_start === "'" || $token_start === '"' ) ) )
  4617. {
  4618. switch ( ( $length === 1 ) ? $token [0] : $token [1] )
  4619. {
  4620. // Tj instruction
  4621. case 'j' :
  4622. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tj' ] ++ ;
  4623. break ;
  4624.  
  4625. // Tm instruction
  4626. case 'm' :
  4627. $Tm [0] = ( double ) $operand_stack [0] ;
  4628. $Tm [1] = ( double ) $operand_stack [1] ;
  4629. $Tm [2] = ( double ) $operand_stack [2] ;
  4630. $Tm [3] = ( double ) $operand_stack [3] ;
  4631. $Tm [4] = ( double ) $operand_stack [4] ;
  4632. $Tm [5] = ( double ) $operand_stack [5] ;
  4633.  
  4634. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ;
  4635. break ;
  4636.  
  4637. // Tf instruction
  4638. case 'f' :
  4639. $current_font_name = $operand_stack [0] ;
  4640. $key = "$page_number:$current_template:$current_font_name" ;
  4641.  
  4642. // We have to map a font specifier (such /TT0, C0-1, etc.) into an object id.
  4643. // Check first if we already met this font
  4644. if ( isset ( $this -> MapIdBuffer [ $key ] ) )
  4645. $current_font = $this -> MapIdBuffer [ $key ] ;
  4646. // Otherwise retrieve its corresponding object number and put it in our font cache
  4647. else
  4648. {
  4649. $current_font = $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $current_font_name ) ;
  4650.  
  4651. $this -> MapIdBuffer [ $key ] = $current_font ;
  4652. }
  4653.  
  4654. $current_font_height = ( double ) $operand_stack [1] ;
  4655. $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
  4656. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ;
  4657. break ;
  4658.  
  4659. // Td instruction
  4660. case 'd' :
  4661. $Tm [4] += ( double ) $operand_stack [0] * abs ( $Tm [0] ) ;
  4662. $Tm [5] += ( double ) $operand_stack [1] * abs ( $Tm [3] ) ;
  4663.  
  4664. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Td' ] ++ ;
  4665. break ;
  4666.  
  4667. // TJ instruction
  4668. case 'J' :
  4669. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TJ' ] ++ ;
  4670. break ;
  4671.  
  4672. // TD instruction
  4673. case 'D' :
  4674. $Tm [4] += ( double ) $operand_stack [0] * $Tm [0] ;
  4675. $Tm [5] += ( double ) $operand_stack [1] * $Tm [3] ;
  4676. $text_leading -= $Tm [5] ;
  4677.  
  4678. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TD' ] ++ ;
  4679. break ;
  4680.  
  4681. // T* instruction
  4682. case '*' :
  4683. $Tm [4] = 0.0 ;
  4684. $Tm [5] -= $text_leading ; //$current_font_height ;
  4685.  
  4686. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ;
  4687. break ;
  4688.  
  4689. // TL instruction - Set text leading. Currently not used.
  4690. case 'L' :
  4691. $text_leading = ( double ) $operand_stack [0] ;
  4692. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ;
  4693. break ;
  4694.  
  4695. // ' instruction : go to next line and display text
  4696. case "'" :
  4697. // Update the coordinates of the last text block found so far
  4698. $page_fragments [ $page_fragment_count - 1 ] [ 'x' ] += $text_leading ;
  4699. $offset = $current_font_height * abs ( $Tm [3] ) ;
  4700. $page_fragments [ $page_fragment_count - 1 ] [ 'y' ] -= $offset ;
  4701.  
  4702. // And don't forget to update the y coordinate of the current transformation matrix
  4703. $Tm [5] -= $offset ;
  4704.  
  4705. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ;
  4706. break ;
  4707.  
  4708. // "'" instruction
  4709. case '"' :
  4710. if ( self::$DEBUG )
  4711. warning ( "Instruction $token not yet implemented." ) ;
  4712.  
  4713. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '"' ] ++ ;
  4714. break ;
  4715.  
  4716. // Other : ignore them
  4717. default :
  4718. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
  4719. }
  4720.  
  4721. $operand_stack = array ( ) ;
  4722. }
  4723. // cm instruction
  4724. else if ( $token == 'cm' )
  4725. {
  4726. $a = ( double ) $operand_stack [0] ;
  4727. $b = ( double ) $operand_stack [1] ;
  4728. $c = ( double ) $operand_stack [2] ;
  4729. $d = ( double ) $operand_stack [3] ;
  4730. $e = ( double ) $operand_stack [4] ;
  4731. $f = ( double ) $operand_stack [5] ;
  4732.  
  4733. $CTM = array ( $a, $b, $c, $d, $e, $f ) ;
  4734. $operand_stack = array ( ) ;
  4735.  
  4736. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'cm' ] ++ ;
  4737. }
  4738. // q/Q instructions (save/restore graphic context)
  4739. else if ( $token === 'q' )
  4740. {
  4741. $graphic_stack [ $graphic_stack_size ++ ] = array ( $CTM, $Tm ) ;
  4742. $operand_stack = array ( ) ;
  4743. }
  4744. else if ( $token === 'Q' )
  4745. {
  4746. if ( $graphic_stack_size )
  4747. list ( $CTM, $Tm ) = $graphic_stack [ -- $graphic_stack_size ] ;
  4748. else if ( self::$DEBUG )
  4749. warning ( "Tried to restore graphics context from an empty stack." ) ;
  4750.  
  4751. $operand_stack = array ( ) ;
  4752. }
  4753. // Text array in the [...] notation. Well, in fact, even non-array constructs are returned as an array by the
  4754. // __next_token() function, for the sake of simplicity
  4755. else if ( $token_start === '[' )
  4756. {
  4757. $text = $this -> __decode_text ( $token, $current_font, $current_font_mapped, $current_font_map_width ) ;
  4758.  
  4759. if ( $text !== '' )
  4760. {
  4761. $r = $this -> __matrix_multiply ( $Tm, $CTM, $page_attributes [ 'width' ], $page_attributes [ 'height' ] ) ;
  4762. $fragment = array
  4763. (
  4764. 'x' => ( $r [4] < 0 ) ? 0.0 : $r [4],
  4765. 'y' => ( $r [5] < 0 ) ? 0.0 : $r [5],
  4766. 'page' => $page_number,
  4767. 'template' => $current_template,
  4768. 'font' => $current_font_name,
  4769. 'font-height' => abs ( $current_font_height * $Tm [3] ),
  4770. 'text' => $text,
  4771. ) ;
  4772.  
  4773. // Add debug information when needed
  4774. if ( self::$DEBUG )
  4775. {
  4776. $fragment = array_merge
  4777. (
  4778. $fragment,
  4779. array
  4780. (
  4781. 'CTM' => $CTM,
  4782. 'Tm' => $Tm,
  4783. 'New Tm' => $r,
  4784. 'Real font height' => $current_font_height,
  4785. 'Page width' => $page_attributes [ 'width' ],
  4786. 'Page height' => $page_attributes ['height' ]
  4787. )
  4788. ) ;
  4789. }
  4790.  
  4791. // Add this text fragment to the list
  4792. $page_fragments [] = $fragment ;
  4793. $page_fragment_count ++ ;
  4794.  
  4795. $operand_stack = array ( ) ;
  4796. }
  4797. }
  4798. // BT instruction
  4799. else if ( $token == 'BT' )
  4800. {
  4801. $BT_nesting_level ++ ;
  4802. $operand_stack = array ( ) ;
  4803. $graphic_stack [ $graphic_stack_size ++ ] = array ( $CTM, $Tm ) ;
  4804.  
  4805. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'BT' ] ++ ;
  4806. }
  4807. // ET instruction
  4808. else if ( $token == 'ET' )
  4809. {
  4810. if ( $BT_nesting_level )
  4811. {
  4812. $BT_nesting_level -- ;
  4813.  
  4814. if ( ! $BT_nesting_level && $graphic_stack_size )
  4815. {
  4816. list ( $CTM, $Tm ) = $graphic_stack [ -- $graphic_stack_size ] ;
  4817. }
  4818.  
  4819. }
  4820.  
  4821. $operand_stack = array ( ) ;
  4822. }
  4823. // Template (substituted in __next_token)
  4824. else if ( $token_start === '!' )
  4825. {
  4826. if ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P<template> \w+) /ix', $token, $match ) )
  4827. {
  4828. $name = '/' . $match [ 'template' ] ;
  4829. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'template' ] ++ ;
  4830.  
  4831. if ( $this -> PageMap -> IsValidXObjectName ( $name ) )
  4832. $current_template = $name ;
  4833. }
  4834. else
  4835. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
  4836.  
  4837. $operand_stack = array ( ) ;
  4838. }
  4839. // Other instructions
  4840. else
  4841. {
  4842. $operand_stack = array ( ) ;
  4843. $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
  4844. }
  4845.  
  4846. // Update current index in instruction stream
  4847. $data_index = $next_index ;
  4848. }
  4849. }
  4850.  
  4851.  
  4852. // __matrix_multiply -
  4853. // Multiplies matrix $ma by $mb.
  4854. // PDF transformation matrices are 3x3 matrices containing the following values :
  4855. //
  4856. // | sx rx 0 |
  4857. // | ry sy 0 |
  4858. // | tx ty 1 |
  4859. //
  4860. // However, we do not care about the 3rd column, which is always hardcoded. Transformation
  4861. // matrices here are implemented as 6-element arrays :
  4862. //
  4863. // [ sx, rx, ry, sy, tx, ty ]
  private function __matrix_multiply ( $ma, $mb, $page_width, $page_height )
  {
      // Combines the text matrix $ma with the transformation matrix $mb and returns the resulting
      // 6-element array [ sx, rx, ry, sy, tx, ty ].
      //
      // This is NOT a full matrix product : the scaling factors of both operands (indexes 0 and 3)
      // are first reduced to their sign (+1 or -1, zero being treated as negative), because only
      // the position of the text matters here, not its rendered size.
      //
      // $page_width and $page_height are accepted for interface compatibility but are not used.
      //
      // NOTE(review): elements 2 and 3 of the result are computed with the y-sign of $ma where a
      // regular matrix product would use $ma [2] ; $ma [2] is never read. This looks unintended
      // but is preserved as is - confirm before changing.
      $sign_ax = ( $ma [0] > 0 ) ? 1 : -1;
      $sign_ay = ( $ma [3] > 0 ) ? 1 : -1;
      $sign_bx = ( $mb [0] > 0 ) ? 1 : -1;
      $sign_by = ( $mb [3] > 0 ) ? 1 : -1;

      $product = array
      (
          ( $sign_ax * $sign_bx ) + ( $ma [1] * $mb [2] ),
          ( $sign_ax * $mb [1] ) + ( $ma [1] * $sign_by ),
          ( $sign_ay * $sign_bx ) + ( $sign_ay * $mb [2] ),
          ( $sign_ay * $mb [1] ) + ( $sign_ay * $sign_by ),
          ( $ma [4] * $sign_bx ) + ( $ma [5] * $mb [2] ) + $mb [4],
          ( $ma [4] * $mb [1] ) + ( $ma [5] * $sign_by ) + $mb [5]
      );

      // When an axis ends up mirrored, the translation on that axis is made positive
      if ( $product [0] < 0 ) {
          $product [4] = abs ( $product [4] );
      }

      if ( $product [3] < 0 ) {
          $product [5] = abs ( $product [5] );
      }

      return $product;
  }
  4892.  
  4893.  
  4894. // __next_token_ex :
  4895. // Reviewed version of __next_token, adapted to ExtractTextWithLayout.
  4896. // Both functions will be unified when this one will be stabilized.
  private function __next_token_ex ( $page_number, $data, $data_length, $index, &$token, &$next_index )
  {
      // Reads the next token of an instruction stream, starting at offset $index.
      //
      // Parameters :
      //   $page_number - Not used by this method.
      //   $data        - Instruction stream being scanned.
      //   $data_length - Number of bytes of $data to consider.
      //   $index       - Offset where scanning starts ; leading whitespace is skipped first.
      //   $token       - (output) Token text. Arrays, parenthesized strings and hex strings are
      //                  all returned enclosed in square brackets.
      //   $next_index  - (output) Offset of the first byte located after the token.
      //
      // Returns true when a token was read, false at end of input or when a construct starting
      // with "<" has no closing delimiter.

      // Skip leading whitespace
      $skipped = 0;

      if ( $index < $data_length ) {
          $skipped = strspn ( $data, " \t\r\n", $index, $data_length - $index );
          $index += $skipped;
      }

      $enhanced_statistics = $this -> EnhancedStatistics;

      if ( $enhanced_statistics ) {
          $this -> Statistics [ 'Distributions' ] [ 'space' ] += $skipped;
      }

      // End of input
      if ( $index >= $data_length ) {
          return false;
      }

      // The first character tells which kind of token follows
      $ch = $data [ $index ];

      // Array "[ ... ]" : find the closing bracket, ignoring brackets located inside
      // parenthesized strings or escaped with a backslash. Line breaks are dropped.
      if ( $ch === '[' ) {
          $next_index = $index + 1;
          $token = '[';
          $open_parentheses = 0;

          while ( $next_index < $data_length ) {
              $nch = $data [ $next_index ++ ];

              if ( $nch === "\n" || $nch === "\r" ) {
                  continue;
              }

              $token .= $nch;

              if ( $nch === '(' ) {
                  $open_parentheses ++;
              } elseif ( $nch === ')' ) {
                  $open_parentheses --;
              } elseif ( $nch === '\\' ) {
                  // Copy the escaped character verbatim, unless the backslash ends the data
                  if ( $next_index < $data_length ) {
                      $token .= $data [ $next_index ++ ];
                  }
              } elseif ( $nch === ']' && ! $open_parentheses ) {
                  break;
              }
          }

          if ( $enhanced_statistics ) {
              $this -> Statistics [ 'Distributions' ] [ '[' ] ++;
          }

          return true;
      }

      // Parenthesized string "( ... )" : returned as "[( ... )]". Escape sequences are copied
      // verbatim so that an escaped ")" does not end the string.
      // NOTE(review): the string ends at the first unescaped ")" ; unescaped nested parentheses
      // are therefore not supported here - confirm whether this matters for the caller.
      if ( $ch === '(' ) {
          $next_index = $index + 1;
          $token = '[' . $ch;

          while ( $next_index < $data_length ) {
              $nch = $data [ $next_index ++ ];

              if ( $nch === ')' ) {
                  $token .= ')';
                  break;
              }

              $token .= $nch;

              if ( $nch !== '\\' ) {
                  continue;
              }

              // Backslash ending the data : nothing left to escape
              if ( $next_index >= $data_length ) {
                  break;
              }

              if ( $data [ $next_index ] >= '0' && $data [ $next_index ] <= '7' ) {
                  // Octal character reference : copy every following octal digit
                  while ( $next_index < $data_length && $data [ $next_index ] >= '0' && $data [ $next_index ] <= '7' ) {
                      $token .= $data [ $next_index ++ ];
                  }
              } else {
                  // Regular character escape
                  $token .= $data [ $next_index ++ ];
              }
          }

          if ( $enhanced_statistics ) {
              $this -> Statistics [ 'Distributions' ] [ '(' ] ++;
          }

          $token .= ']';

          return true;
      }

      // Either a "<< ... >>" construct, or a string of hex digits "< ... >"
      if ( $ch === '<' ) {
          if ( ! isset ( $data [ $index + 1 ] ) ) {
              return false;
          }

          if ( $data [ $index + 1 ] === '<' ) {
              $next_index = strpos ( $data, '>>', $index + 2 );

              if ( $next_index === false ) {
                  return false;
              }

              $token = substr ( $data, $index, $next_index - $index + 2 );
              $next_index += 2;

              return true;
          }

          // The search starts right after the opening bracket, so that an empty "<>" string
          // is closed by its own bracket and not by a later one
          $next_index = strpos ( $data, '>', $index + 1 );

          if ( $next_index === false ) {
              return false;
          }

          if ( $enhanced_statistics ) {
              $this -> Statistics [ 'Distributions' ] [ '<' ] ++;
          }

          // Spaces and newlines are allowed between hex digits : remove them
          $hex = preg_replace ( '/\s+/', '', substr ( $data, $index, $next_index - $index + 1 ) );

          $token = "[$hex]";
          $next_index ++;

          return true;
      }

      // Quote operators are one-character keywords. The next offset is derived from the position
      // of the operator itself, whatever amount of whitespace was skipped before it.
      if ( $ch === "'" || $ch === '"' ) {
          $token = $ch;
          $next_index = $index + 1;

          return true;
      }

      // Other cases : either a number or a keyword
      $token = $ch;
      $next_index = $index + 1;

      if ( ! isset ( $data [ $next_index ] ) ) {
          return true;
      }

      if ( ( $ch >= '0' && $ch <= '9' ) || $ch == '-' || $ch == '+' || $ch == '.' ) {
          // Number : digits, signs and decimal points are collected without further validation
          while ( $next_index < $data_length ) {
              $digit = $data [ $next_index ];

              if ( ! ( ( $digit >= '0' && $digit <= '9' ) || $digit === '-' || $digit === '+' || $digit === '.' ) ) {
                  break;
              }

              $token .= $digit;
              $next_index ++;
          }
      } elseif ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALPHA ) || $ch == '/' || $ch == '!' ) {
          // Keyword or name
          $ch = $data [ $next_index ];

          while ( $next_index < $data_length &&
                  ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM ) ||
                    $ch == '*' || $ch == '-' || $ch == '_' || $ch == '.' || $ch == '+' ) ) {
              $token .= $ch;
              $next_index ++;

              if ( isset ( $data [ $next_index ] ) ) {
                  $ch = $data [ $next_index ];
              }
          }
      }

      return true;
  }
  5106.  
  5107.  
  5108. // __decode_text -
  5109. // Text decoding function when the PDFOPT_BASIC_LAYOUT flag is specified.
  private function __decode_text ( $data, $current_font, $current_font_mapped, $current_font_map_width )
  {
      // Decodes the text values found in $data (a "[...]" token) and returns the resulting string.
      //
      // Parameters :
      //   $data                   - Token holding the text values, handed over to
      //                             __extract_chars_from_array().
      //   $current_font           - Font used for character map lookups.
      //   $current_font_mapped    - True when character codes of the current font must be
      //                             translated through the font table.
      //   $current_font_map_width - Number of hex digits per character code for mapped fonts.
      //
      // After each text value, padding returned by __get_character_padding() is appended when
      // the offset stored at the same index exceeds $this -> MinSpaceWidth.
      list ( $text_values, $offsets ) = $this -> __extract_chars_from_array ( $data );
      $value_index = 0;
      $result = '';

      // Fonts having character maps require some special processing
      if ( $current_font_mapped ) {
          foreach ( $text_values as $text ) {
              $length = strlen ( $text ) - 1;

              if ( $text [0] == '<' ) {
                  // Hex digits within angle brackets ; several characters can be specified in
                  // the same brackets, each one using $current_font_map_width digits
                  for ( $i = 1; $i < $length; $i += $current_font_map_width ) {
                      $code = hexdec ( substr ( $text, $i, $current_font_map_width ) );
                      $result .= $this -> __decode_text_map_buffered ( $current_font, $code, false );
                  }
              } elseif ( $current_font_map_width == 4 ) {
                  // Double-byte codes specified as plain text within parentheses : each pair of
                  // bytes (either of them possibly written as an escape sequence) is one code
                  for ( $i = 1; $i < $length; $i ++ ) {
                      $high = $this -> __decode_text_read_byte ( $text, $i, $length );
                      $i ++;
                      $low = $this -> __decode_text_read_byte ( $text, $i, $length );

                      $code = ( ord ( $high ) << 8 ) | ord ( $low );
                      $newchar = $this -> __decode_text_map_buffered ( $current_font, $code, true );

                      // A backslash encoded on two bytes can be followed by an escaped character
                      // encoded on one byte only, as in :
                      //   0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")"
                      // which must be expanded as "(Car)"
                      if ( $newchar == '\\' ) {
                          $newchar = $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] );
                          $i ++;      // 3 bytes were processed this time, not 2
                      }

                      $result .= $newchar;
                  }
              } else {
                  // Character string within parentheses : every character goes through the
                  // character map, after escape sequences have been resolved
                  for ( $i = 1; $i < $length; $i ++ ) {
                      $ch = $text [$i];

                      if ( $ch == '\\' ) {
                          $ch = $text [++ $i];

                          if ( $ch < '0' || $ch > '7' ) {
                              $ch = $this -> ProcessEscapedCharacter ( $ch );
                          } else {
                              // Octal form : a character code can span several "\ooo" groups,
                              // depending on the code width of the current font (for example
                              // "\000\077" when codes use 4 hex digits)
                              $width = $current_font_map_width / 2;   // Convert to byte count
                              $subtext = substr ( $text, $i - 1 );
                              $regex = "#^ (\\\\ [0-7]{3}){1,$width} #imsx";

                              if ( preg_match ( $regex, $subtext, $octal_matches ) ) {
                                  $ord = 0;

                                  foreach ( explode ( '\\', substr ( $octal_matches [0], 1 ) ) as $octal_value ) {
                                      $ord = ( $ord << 8 ) + octdec ( $octal_value );
                                  }

                                  $ch = chr ( $ord );
                                  $i += strlen ( $octal_matches [0] ) - 2;
                              }
                          }
                      }

                      // The lookup buffer is deliberately not used on this path
                      $result .= $this -> FontTable -> MapCharacter ( $current_font, ord ( $ch ) );
                  }
              }

              // Handle offsets between blocks of characters
              // NOTE(review): only negative offsets are considered here, whereas the branch for
              // unmapped fonts below tests the absolute value - confirm this is intended.
              if ( isset ( $offsets [ $value_index ] ) &&
                   - ( $offsets [ $value_index ] ) > $this -> MinSpaceWidth ) {
                  $result .= $this -> __get_character_padding ( $offsets [ $value_index ] );
              }

              $value_index ++;
          }
      }
      // Fonts having no associated character map : "<xxxx>" constructs are converted to UTF8,
      // plain strings are unescaped and only characters above 126 are looked up
      else {
          foreach ( $text_values as $text ) {
              $is_hex = ( $text [0] == '<' );

              // A backslash followed by a line break inside a string is a line continuation :
              // remove such constructs
              $text = str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text );

              // Computed after the removal above, so that it matches the text actually scanned
              $length = strlen ( $text ) - 1;

              if ( $is_hex ) {
                  for ( $i = 1; $i < $length; $i += 2 ) {
                      $result .= $this -> CodePointToUtf8 ( hexdec ( substr ( $text, $i, 2 ) ) );
                  }
              } else {
                  $text = self::Unescape ( $text );

                  for ( $i = 1, $length = strlen ( $text ) - 1; $i < $length; $i ++ ) {
                      $ch = $text [$i];
                      $ord = ord ( $ch );

                      if ( $ord < 127 ) {
                          $result .= $ch;
                      } else {
                          $result .= $this -> __decode_text_map_buffered ( $current_font, $ord, false );
                      }
                  }
              }

              // Handle offsets between blocks of characters
              if ( isset ( $offsets [ $value_index ] ) &&
                   abs ( $offsets [ $value_index ] ) > $this -> MinSpaceWidth ) {
                  $result .= $this -> __get_character_padding ( $offsets [ $value_index ] );
              }

              $value_index ++;
          }
      }

      // All done, return
      return $result;
  }


  // __decode_text_map_buffered -
  //    Returns the mapping of character code $code for font $font, storing the results of
  //    FontTable -> MapCharacter() in the CharacterMapBuffer property.
  //    When $double_byte is true, MapCharacter() receives true as its third argument.
  private function __decode_text_map_buffered ( $font, $code, $double_byte )
  {
      if ( isset ( $this -> CharacterMapBuffer [ $font ] [ $code ] ) ) {
          return $this -> CharacterMapBuffer [ $font ] [ $code ];
      }

      if ( $double_byte ) {
          $mapped = $this -> FontTable -> MapCharacter ( $font, $code, true );
      } else {
          $mapped = $this -> FontTable -> MapCharacter ( $font, $code );
      }

      $this -> CharacterMapBuffer [ $font ] [ $code ] = $mapped;

      return $mapped;
  }


  // __decode_text_read_byte -
  //    Reads one byte of $text at offset $i, resolving a backslash escape if there is one.
  //    On return, $i is the offset of the last character consumed.
  private function __decode_text_read_byte ( $text, &$i, $length )
  {
      if ( $text [$i] != '\\' ) {
          return $text [$i];
      }

      $i ++;

      // Escaped character
      if ( $text [$i] < '0' || $text [$i] > '7' ) {
          return $this -> ProcessEscapedCharacter ( $text [$i] );
      }

      // Octal notation : at most 3 digits
      $octal = '';

      for ( $digits = 0; $digits < 3 && $i < $length && $text [$i] >= '0' && $text [$i] <= '7'; $digits ++ ) {
          $octal .= $text [$i ++];
      }

      $i --;

      return chr ( octdec ( $octal ) );
  }
  5389.  
  5390.  
  5391. // __assemble_text_fragments -
  5392. // Assembles text fragments collected by the ExtractTextWithLayout function.
  private function __assemble_text_fragments ( $page_number, &$fragments, &$page_width, &$page_height )
  {
      // Builds the text of a page from its text fragments.
      //
      // Parameters :
      //   $page_number - Page whose attributes are looked up in $this -> PageMap.
      //   $fragments   - (in/out) Page fragments ; a 'width' entry is added to each of them and
      //                  the array is sorted in place.
      //   $page_width  - (output) Declared page width or, when missing, the rightmost x
      //                  coordinate reached by a fragment.
      //   $page_height - (output) Declared page height or, when missing, the y coordinate of the
      //                  first fragment after sorting.
      //
      // Returns the assembled text ; an empty string when there is no fragment, in which case
      // the output parameters are left untouched.
      if ( ! count ( $fragments ) ) {
          return '';
      }

      // Compute the width of each fragment
      foreach ( $fragments as &$fragment ) {
          $this -> __compute_fragment_width ( $fragment );
      }

      // Break the reference, otherwise every later assignment to $fragment would overwrite
      // an element of the caller's array
      unset ( $fragment );

      // Sort the fragments and group them by line
      usort ( $fragments, array ( $this, '__sort_page_fragments' ) );
      $line_fragments = $this -> __group_line_fragments ( $fragments );

      // Some PDF files do not specify page width or page height ; fall back to the coordinates
      // of the fragments in this case
      $page_attributes = $this -> PageMap -> PageAttributes [ $page_number ];

      if ( isset ( $page_attributes [ 'width' ] ) && $page_attributes [ 'width' ] ) {
          $page_width = $page_attributes [ 'width' ];
      } else {
          $page_width = 0;

          foreach ( $fragments as $candidate ) {
              $end_x = $candidate [ 'x' ] + $candidate [ 'width' ];

              if ( $end_x > $page_width ) {
                  $page_width = $end_x;
              }
          }
      }

      if ( isset ( $page_attributes [ 'height' ] ) && $page_attributes [ 'height' ] ) {
          $page_height = $page_attributes [ 'height' ];
      } else {
          $page_height = $fragments [0] [ 'y' ];
      }

      // Block separator
      if ( $this -> BlockSeparator ) {
          $separator = $this -> BlockSeparator;
      } else {
          $separator = ' ';
      }

      $show_coordinates = ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES );

      // Add page information if the PDFOPT_DEBUG_SHOW_COORDINATES option has been specified
      $result = '';

      if ( $show_coordinates ) {
          $result = "[Page : $page_number, width = $page_width, height = $page_height]" . $this -> EOL;
      }

      // Loop through each line of fragments
      foreach ( $line_fragments as $line ) {
          $current_x = 0;
          $is_first = true;

          foreach ( $line as $fragment ) {
              // Markers that have no associated font yet are matched against the text of the
              // current fragment ; on a match, the marker gets tied to the fragment's font and
              // leaves the list of unprocessed markers
              foreach ( $this -> UnprocessedMarkerList [ 'font' ] as $marker_key => $marker ) {
                  if ( ! preg_match ( $marker [ 'regex' ], $fragment [ 'text' ] ) ) {
                      continue;
                  }

                  $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] = array
                  (
                      'font'   => $fragment [ 'font' ],
                      'height' => $fragment [ 'font-height' ],
                      'regex'  => $marker [ 'regex' ],
                      'start'  => $marker [ 'start' ],
                      'end'    => $marker [ 'end' ]
                  );

                  // Reindex after the removal so that the list never contains holes
                  unset ( $this -> UnprocessedMarkerList [ 'font' ] [ $marker_key ] );
                  $this -> UnprocessedMarkerList [ 'font' ] = array_values ( $this -> UnprocessedMarkerList [ 'font' ] );

                  break;
              }

              // Add debug info if needed
              if ( $show_coordinates ) {
                  $result .= $this -> __debug_get_coordinates ( $fragment );
              }

              // Add a separator between two fragments when there is a gap between them
              // (small rounding errors are accepted)
              if ( ! $is_first && $current_x < floor ( $fragment [ 'x' ] ) ) {
                  $result .= $separator;
              }

              // Surround the text with markers when its font and font height match a marker
              $font = $fragment [ 'font' ];

              if ( isset ( $this -> TextWithFontMarkers [ $font ] ) &&
                   $this -> TextWithFontMarkers [ $font ] [ 'height' ] == $fragment [ 'font-height' ] ) {
                  $result .= $this -> TextWithFontMarkers [ $font ] [ 'start' ] .
                             $fragment [ 'text' ] .
                             $this -> TextWithFontMarkers [ $font ] [ 'end' ];
              } else {
                  $result .= $fragment [ 'text' ];
              }

              // Update current x-position
              $current_x = $fragment [ 'x' ] + $fragment [ 'width' ];
              $is_first = false;
          }

          // Add a line break between each line
          $result .= $this -> EOL;
      }

      // All done, return
      return $result;
  }
  5517.  
  5518.  
  5519. // __sort_page_fragments -
  5520. // Sorts page fragments by their (y,x) coordinates.
  public function __sort_page_fragments ( $a, $b )
  {
      // Comparison callback for usort() : orders fragments by decreasing 'y' coordinate, then by
      // increasing 'x' coordinate. Returns -1, 0 or 1.
      //
      // An integer is returned on purpose : usort() casts the callback result to an integer, so
      // returning the raw difference of two coordinates would make fragments that are less than
      // one unit apart compare as equal.
      if ( $a [ 'y' ] != $b [ 'y' ] ) {
          return ( $a [ 'y' ] < $b [ 'y' ] ) ? 1 : -1;
      }

      if ( $a [ 'x' ] != $b [ 'x' ] ) {
          return ( $a [ 'x' ] < $b [ 'x' ] ) ? -1 : 1;
      }

      return 0;
  }
  5533.  
  5534.  
  5535. // __sort_line_fragments -
  5536. // Sorts fragments per line.
  public function __sort_line_fragments ( $a, $b )
  {
      // Comparison callback for usort() : orders fragments by increasing 'x' coordinate.
      // Returns -1, 0 or 1 rather than the difference of the coordinates, because usort()
      // casts the callback result to an integer and would see a difference below 1 as equality.
      if ( $a [ 'x' ] == $b [ 'x' ] ) {
          return 0;
      }

      return ( $a [ 'x' ] < $b [ 'x' ] ) ? -1 : 1;
  }
  5541.  
  5542.  
  5543. // __group_line_fragments -
  5544. // Groups page fragments per line, allowing a certain variation in the y-position.
  private function __group_line_fragments ( $fragments )
  {
      // Splits $fragments, which are expected to be sorted by decreasing 'y' coordinate, into
      // lines. A fragment stays on the current line as long as its 'y' coordinate plus its font
      // height reaches the 'y' coordinate of the fragment that started the line.
      //
      // Returns an array of lines, each line being an array of fragments sorted by increasing
      // 'x' coordinate ; an empty array when there is no fragment.
      $lines = array ( );
      $current_line = array ( );
      $line_y = null;

      foreach ( $fragments as $fragment ) {
          if ( $current_line && $fragment [ 'y' ] + $fragment [ 'font-height' ] >= $line_y ) {
              $current_line [] = $fragment;
              continue;
          }

          // This fragment starts a new line : flush the previous one, if any
          if ( $current_line ) {
              usort ( $current_line, array ( $this, '__sort_line_fragments' ) );
              $lines [] = $current_line;
          }

          $line_y = $fragment [ 'y' ];
          $current_line = array ( $fragment );
      }

      // Flush the last line
      if ( $current_line ) {
          usort ( $current_line, array ( $this, '__sort_line_fragments' ) );
          $lines [] = $current_line;
      }

      return $lines;
  }
  5575.  
  5576.  
  5577. // __compute_fragment_width -
  5578. // Compute the width of the specified text fragment and add the width entry accordingly.
  5579. // Returns the font object associated with this fragment
  private function __compute_fragment_width ( &$fragment )
  {
      // Adds a 'width' entry to $fragment and returns the font object used to compute it.
      // The width is 0 when no font object is available for the fragment.
      //
      // Font objects are looked up once per page/template/font combination and then kept in
      // the FontObjectsBuffer property, to avoid repeated calls to FontTable -> GetFontObject().
      $buffer_key = "{$fragment [ 'page' ]}:{$fragment [ 'template' ]}:{$fragment [ 'font' ]}";

      if ( ! isset ( $this -> FontObjectsBuffer [ $buffer_key ] ) ) {
          $this -> FontObjectsBuffer [ $buffer_key ] = $this -> FontTable -> GetFontObject
          (
              $fragment [ 'page' ],
              $fragment [ 'template' ],
              $fragment [ 'font' ]
          );
      }

      $font_object = $this -> FontObjectsBuffer [ $buffer_key ];

      if ( $font_object ) {
          $fragment [ 'width' ] = $font_object -> GetStringWidth ( $fragment [ 'text' ], $this -> ExtraTextWidth );
      } else {
          $fragment [ 'width' ] = 0;
      }

      return $font_object;
  }
  5599.  
  5600.  
  5601. // __debug_get_coordinates -
  5602. // Returns the coordinates of the specified text fragment, in debug mode.
  private function __debug_get_coordinates ( $fragment )
  {
      // Returns a one-line description of $fragment (position, width, font height and font
      // name), preceded by a newline. Numeric values are rounded to 3 decimals.
      $x = round ( $fragment [ 'x' ], 3 );
      $y = round ( $fragment [ 'y' ], 3 );
      $width = round ( $fragment [ 'width' ], 3 );
      $height = round ( $fragment [ 'font-height' ], 3 );

      $description = "\n[x:" . $x;
      $description .= ', y:' . $y;
      $description .= ", w: " . $width;
      $description .= ", h:" . $height;
      $description .= ", font:" . $fragment [ 'font' ] . "]";

      return $description;
  }
  5608.  
  5609.  
  5610. /*--------------------------------------------------------------------------------------------------------------
  5611.  
  5612. NAME
  5613. GetTrailerInformation - Retrieves trailer information.
  5614.  
  5615. PROTOTYPE
  5616. $this -> GetTrailerInformation ( $contents, $pdf_objects ) ;
  5617.  
  5618. DESCRIPTION
  5619. Retrieves trailer information :
  5620. - Unique file ID
  5621. - Id of the object containing encryption data, if the PDF file is encrypted
  5622. - Encryption data
  5623.  
  5624. PARAMETERS
  5625. $contents (string) -
  5626. PDF file contents.
  5627.  
  5628. *-------------------------------------------------------------------------------------------------------------*/
  protected function GetTrailerInformation ( $contents, $pdf_objects )
  {
      // Parameters :
      //   $contents (string)   - PDF file contents, searched for the "trailer << ... >>" construct.
      //   $pdf_objects (array) - Object contents indexed by object id ; used to locate the object
      //                          referenced by the /Encrypt entry of the trailer.
      // Sets the ID and ID2 properties when the trailer has an /ID entry, and the EncryptionData and
      // IsEncrypted properties when the trailer references an existing encryption object.
      static $id_regex = '#
          /ID \s* \[ \s*
          < (?P<id1> [^>]+) >
          \s*
          < (?P<id2> [^>]+) >
          \s* \]
          #imsx' ;

      // No trailer found : nothing can be retrieved
      if ( ! preg_match ( '/trailer \s* << (?P<trailer> .+?) >>/imsx', $contents, $trailer_match ) )
          return ;

      $trailer_data = $trailer_match [ 'trailer' ] ;

      // Both parts of the file identifier are kept when the /ID entry is present
      if ( preg_match ( $id_regex, $trailer_data, $id_match ) )
      {
          $this -> ID = $id_match [ 'id1' ] ;
          $this -> ID2 = $id_match [ 'id2' ] ;
      }

      // The /Encrypt entry, if any, gives the number of the object that describes encryption data
      if ( ! preg_match ( '#/Encrypt \s+ (?P<object> \d+)#ix', $trailer_data, $encrypt_match ) )
          return ;

      $encrypt_object_id = $encrypt_match [ 'object' ] ;

      if ( isset ( $pdf_objects [ $encrypt_object_id ] ) )
      {
          // GetInstance() returning false means that no encryption data could be built
          $this -> EncryptionData = PdfEncryptionData::GetInstance ( $this -> ID, $encrypt_object_id, $pdf_objects [ $encrypt_object_id ] ) ;
          $this -> IsEncrypted = ( false !== $this -> EncryptionData ) ;
      }
      // The referenced object does not exist : this is only reported in debug mode
      else if ( self::$DEBUG )
          error ( new PdfToTextDecodingException ( "Object #$encrypt_object_id, which should contain encryption data, is missing." ) ) ;
  }
  5670.  
  5671.  
  5672. // __build_ignored_instructions :
  5673. // Takes the template regular expressions from self::$IgnoredInstructionTemplatesLayout and self::$IgnoredInstructionTemplatesNoLayout,
  5674. // replaces each construct with the contents of the self::$ReplacementConstructs array, and appends the results to self::$IgnoredInstructionsLayout
  5675. // and self::$IgnoredInstructionsNoLayout ; these regular expressions match the Postscript instructions to be removed from any text stream.
  private function __build_ignored_instructions ( )
  {
      // Expands the constructs listed in self::$ReplacementConstructs (keys are replaced by values)
      // inside every template, then wraps each expanded template between '/' delimiters with the
      // 'msx' modifiers.
      $searches = array_keys ( self::$ReplacementConstructs ) ;
      $replacements = array_values ( self::$ReplacementConstructs ) ;

      $layout_patterns = str_replace ( $searches, $replacements, self::$IgnoredInstructionTemplatesLayout ) ;
      $nolayout_patterns = str_replace ( $searches, $replacements, self::$IgnoredInstructionTemplatesNoLayout ) ;

      // Expressions built from the "layout" templates are added to both lists
      foreach ( $layout_patterns as $pattern )
      {
          $regex = "/$pattern/msx" ;

          self::$IgnoredInstructionsLayout [] = $regex ;
          self::$IgnoredInstructionsNoLayout [] = $regex ;
      }

      // Expressions built from the "no layout" templates are only added to the "no layout" list
      foreach ( $nolayout_patterns as $pattern )
          self::$IgnoredInstructionsNoLayout [] = "/$pattern/msx" ;
  }
  5696.  
  5697.  
  5698. // __convert_utf16 :
  5699. // Some strings found in a pdf file can be encoded in UTF16 (author information, for example).
  5700. // When this is the case, the string is converted to UTF8.
  private function __convert_utf16 ( $text )
  {
      // Returns $text converted to UTF-8 when it starts with a UTF-16 byte order mark
      // (0xFE 0xFF or 0xFF 0xFE) ; any other value is returned unchanged.

      // A byte order mark needs two bytes : shorter values cannot be UTF-16
      if ( ! isset ( $text [0], $text [1] ) )
          return ( $text ) ;

      $mark = ( ord ( $text [0] ) << 8 ) | ord ( $text [1] ) ;

      if ( $mark == 0xFEFF || $mark == 0xFFFE )
          return ( mb_convert_encoding ( $text, 'UTF-8', 'UTF-16' ) ) ;

      return ( $text ) ;
  }
  5714.  
  5715.  
  5716. // __extract_chars_from_array -
  5717. // Extracts characters enclosed either within parentheses (character codes) or angle brackets (hex value)
  5718. // from an array.
  5719. // Example :
  5720. //
  5721. // [<0D>-40<02>-36<03>-39<0E>-36<0F>-36<0B>-37<10>-37<10>-35(abc)]
  5722. //
  5723. // will return an array having the following entries :
  5724. //
  5725. // <0D>, <02>, <03>, <0E>, <0F>, <0B>, <10>, <10>, (abc)
  private function __extract_chars_from_array ( $array )
  {
      // Splits the contents of a text array (including its enclosing brackets) into :
      //   - the list of strings it contains, each one kept with its delimiters, ie "(...)" for
      //     literal strings and "<...>" for hexadecimal strings
      //   - the list of numeric offsets found between the strings, as doubles. Note that every
      //     character that is neither a string delimiter nor part of a number (whitespace, for
      //     example) also adds a zero entry to this list.
      // Returns array ( $strings, $offsets ).
      //
      // Literal strings may contain balanced, unescaped parentheses : the string only ends at the
      // closing parenthesis that matches its opening one. Escaped characters are copied as is, together
      // with their backslash.
      $length = strlen ( $array ) - 1 ;         // Index of the closing bracket, which is never scanned
      $result = array ( ) ;
      $offsets = array ( ) ;

      for ( $i = 1 ; $i < $length ; $i ++ )     // Start with character right after the opening bracket
      {
          $ch = $array [$i] ;

          if ( $ch == '(' )
              $endch = ')' ;
          else if ( $ch == '<' )
              $endch = '>' ;
          // Not a string : collect the characters of a numeric offset
          else
          {
              $value = '' ;

              while ( $i < $length && ( ( $array [$i] >= '0' && $array [$i] <= '9' ) ||
                      $array [$i] == '-' || $array [$i] == '+' || $array [$i] == '.' ) )
                  $value .= $array [$i++] ;

              $offsets [] = ( double ) $value ;

              // The loop above stopped one character too far ; step back since the for() loop will
              // increment the index again
              if ( $value !== '' )
                  $i -- ;

              continue ;
          }

          $char = $ch ;
          $nesting = 0 ;                        // Number of unescaped, not yet closed inner parentheses
          $i ++ ;

          while ( $i < $length )
          {
              $current = $array [$i] ;

              // Escape sequence : keep the backslash and the character that follows
              if ( $current == '\\' )
                  $char .= '\\' . $array [++$i] ;
              // Inner opening parenthesis of a literal string
              else if ( $ch == '(' && $current == '(' )
              {
                  $nesting ++ ;
                  $char .= $current ;
              }
              else if ( $current == $endch )
              {
                  // This is the delimiter that really ends the string
                  if ( ! $nesting )
                      break ;

                  $nesting -- ;
                  $char .= $current ;
              }
              else
                  $char .= $current ;

              $i ++ ;
          }

          // The closing delimiter is always added, even if the string was not terminated
          $result [] = $char . $endch ;
      }

      return ( array ( $result, $offsets ) ) ;
  }
  5779.  
  5780.  
  5781. // __extract_chars_from_block -
  5782. // Extracts characters from a text block (enclosed in parentheses).
  5783. // Returns an array of character ordinals if the $as_array parameter is true, or a string if false.
  private function __extract_chars_from_block ( $text, $start_index = false, $length = false, $as_array = false )
  {
      // Decodes the escape sequences of a literal string.
      // Parameters :
      //   $start_index - Index of the first character to process (false means 0).
      //   $length      - Index at which processing stops, exclusive (false means the string length).
      //   $as_array    - When true, an array of character ordinals is returned instead of a string.
      // Recognized sequences are \n, \r, \t, \f, \v, \b, octal codes of one to three digits, and a
      // backslash followed by an end of line, which is a line continuation and produces nothing.
      // For any other character, the backslash is simply dropped. A backslash that is the very last
      // character of $text is kept as is.
      $result = ( $as_array ) ? array ( ) : '' ;

      if ( $start_index === false )
          $start_index = 0 ;

      if ( $length === false )
          $length = strlen ( $text ) ;

      $ord0 = ord ( '0' ) ;

      for ( $i = $start_index ; $i < $length ; $i ++ )
      {
          $ch = $text [$i] ;

          if ( $ch == '\\' && isset ( $text [ $i + 1 ] ) )
          {
              $ch2 = $text [ ++$i ] ;

              switch ( $ch2 )
              {
                  case 'n' : $ch = "\n" ; break ;
                  case 'r' : $ch = "\r" ; break ;
                  case 't' : $ch = "\t" ; break ;
                  case 'f' : $ch = "\f" ; break ;
                  case 'v' : $ch = "\v" ; break ;
                  case 'b' : $ch = "\x08" ; break ;

                  // Line continuation : neither the backslash nor the end of line belong to the string
                  case "\r" :
                      if ( isset ( $text [ $i + 1 ] ) && $text [ $i + 1 ] == "\n" )
                          $i ++ ;

                      continue 2 ;

                  case "\n" :
                      continue 2 ;

                  default :
                      if ( $ch2 >= '0' && $ch2 <= '7' )
                      {
                          // The digit value must be computed from the character ordinal
                          $ord = ord ( $ch2 ) - $ord0 ;
                          $digits = 1 ;

                          // An octal code has at most three digits ; a fourth digit is regular text
                          while ( $digits < 3 && isset ( $text [ $i + 1 ] ) && $text [ $i + 1 ] >= '0' && $text [ $i + 1 ] <= '7' )
                          {
                              $ord = ( $ord * 8 ) + ord ( $text [ ++$i ] ) - $ord0 ;
                              $digits ++ ;
                          }

                          // Only the low-order byte is meaningful for codes above 255
                          $ch = chr ( $ord & 0xFF ) ;
                      }
                      else
                          $ch = $ch2 ;
              }
          }

          if ( $as_array )
              $result [] = ord ( $ch ) ;
          else
              $result .= $ch ;
      }

      return ( $result ) ;
  }
  5847.  
  5848.  
  5849. // __get_character_padding :
  5850. // If the offset specified between two character groups in an array notation for displaying text is less
  5851. // than or equal to -MinSpaceWidth thousandths of text units, returns the separator to be inserted between them (an empty string otherwise).
  private function __get_character_padding ( $char_offset )
  {
      // Returns the string to be inserted between two character groups separated by $char_offset :
      //   - an empty string when the offset is greater than -MinSpaceWidth
      //   - otherwise the Separator property, repeated when the PDFOPT_REPEAT_SEPARATOR option is set.
      // The result goes through self::Unescape() and is converted from ISO-8859-1 to UTF-8.
      if ( $char_offset > - $this -> MinSpaceWidth )
          return ( '' ) ;

      $padding = $this -> Separator ;

      if ( $this -> Options & self::PDFOPT_REPEAT_SEPARATOR )
      {
          // If the MinSpaceWidth property is less than 1000 (text units), consider it has the value 1000
          // so that an exuberant number of spaces will not be repeated
          $space_width = ( $this -> MinSpaceWidth < 1000 ) ? 1000 : $this -> MinSpaceWidth ;
          $repeat_count = ( int ) abs ( round ( $char_offset / $space_width, 0 ) ) ;

          // A count of zero still gives one separator
          if ( $repeat_count > 1 )
              $padding = str_repeat ( $this -> Separator, $repeat_count ) ;
      }

      // mb_convert_encoding() from ISO-8859-1 is the replacement for utf8_encode(), which is deprecated
      // as of PHP 8.2
      return ( mb_convert_encoding ( self::Unescape ( $padding ), 'UTF-8', 'ISO-8859-1' ) ) ;
  }
  5877.  
  5878.  
  5879. // __get_output_image_filename -
  5880. // Returns a real filename based on a template supplied by the AutoSaveImageFileTemplate property.
  private function __get_output_image_filename ( )
  {
      // Expands the ImageAutoSaveFileTemplate property into a filename. Recognized constructs are :
      //   %%    A single percent sign
      //   %p    Directory of the Filename property
      //   %f    Name of the Filename property, without directory and suffix
      //   %s    File suffix for the ImageAutoSaveFormat property ("unknown" if not in the table below)
      //   %d    Value of the ImageCount property
      //   %xd   Same, left-padded with zeroes to a width of "x" digits
      // Anything else, including a trailing or incomplete construct, is copied as is.
      // The template is expanded in a single pass, so that the text substituted for a construct is
      // never interpreted again.
      static $suffixes = array
         (
          IMG_JPEG => 'jpg',
          IMG_JPG => 'jpg',
          IMG_GIF => 'gif',
          IMG_PNG => 'png',
          IMG_WBMP => 'wbmp',
          IMG_XPM => 'xpm'
         ) ;

      $template = $this -> ImageAutoSaveFileTemplate ;
      $length = strlen ( $template ) ;
      $parts = pathinfo ( $this -> Filename ) ;

      if ( ! isset ( $parts [ 'filename' ] ) )          // for PHP versions < 5.2
      {
          // The filename is what comes before the last dot of the base name
          $index = strrpos ( $parts [ 'basename' ], '.' ) ;

          if ( $index === false )
              $parts [ 'filename' ] = $parts [ 'basename' ] ;
          else
              $parts [ 'filename' ] = substr ( $parts [ 'basename' ], 0, $index ) ;
      }

      // pathinfo() does not return a 'dirname' entry for an empty path
      $dirname = isset ( $parts [ 'dirname' ] ) ? $parts [ 'dirname' ] : '' ;
      $result = '' ;

      for ( $i = 0 ; $i < $length ; $i ++ )
      {
          $ch = $template [$i] ;

          // Regular character, or percent sign ending the template
          if ( $ch != '%' || $i + 1 >= $length )
          {
              $result .= $ch ;
              continue ;
          }

          $ch = $template [ ++ $i ] ;

          // Percent sign found : check the character after
          switch ( $ch )
          {
              case '%' :
                  $result .= '%' ;
                  break ;

              case 'p' :
                  $result .= $dirname ;
                  break ;

              case 'f' :
                  $result .= $parts [ 'filename' ] ;
                  break ;

              case 's' :
                  if ( isset ( $suffixes [ $this -> ImageAutoSaveFormat ] ) )
                      $result .= $suffixes [ $this -> ImageAutoSaveFormat ] ;
                  else
                      $result .= 'unknown' ;

                  break ;

              case 'd' :
                  $result .= $this -> ImageCount ;
                  break ;

              default :
                  // Unknown construct : keep it unchanged
                  if ( ! ctype_digit ( $ch ) )
                  {
                      $result .= '%' . $ch ;
                      break ;
                  }

                  // "%xd" construct : collect the digits that give the width of the sequential index
                  $width = 0 ;
                  $chars = '' ;

                  while ( $i < $length && ctype_digit ( $template [$i] ) )
                  {
                      $width = ( $width * 10 ) + ord ( $template [$i] ) - ord ( '0' ) ;
                      $chars .= $template [$i] ;
                      $i ++ ;
                  }

                  if ( $i < $length && $template [$i] == 'd' )
                      $result .= sprintf ( "%0{$width}d", $this -> ImageCount ) ;
                  // Digits not followed by "d" : keep them unchanged, and step back so that the
                  // character that follows them is examined by the next iteration
                  else
                  {
                      $result .= '%' . $chars ;
                      $i -- ;
                  }
          }
      }

      // All done, return
      return ( $result ) ;
  }
  5991.  
  5992.  
  5993. // __rtl_process -
  5994. // Processes the contents of a page when it contains characters belonging to an RTL language.
  private function __rtl_process ( $text )
  {
      // Reorders the contents of each line of $text that contains RTL characters :
      //   - The characters of each series of consecutive RTL characters are put in reverse order
      //   - Consecutive RTL words, together with the space/punctuation strings found between them,
      //     are then put in reverse order
      //   - Other text keeps its position in the line
      // Carriage returns are removed from the processed text. The text is returned unchanged when
      // it contains none of the bytes listed in self::$RtlCharacterPrefixes, or when the first of
      // these bytes is a NUL character.
      $length = strlen ( $text ) ;
      $pos = strcspn ( $text, self::$RtlCharacterPrefixes ) ;

      if ( $pos == $length || $text [$pos] === "\x00" )
          return ( $text ) ;

      // Extract each individual line, and get rid of carriage returns if any
      $lines = explode ( "\n", str_replace ( "\r", '', $text ) ) ;
      $new_lines = array ( ) ;

      foreach ( $lines as $line )
      {
          // Check if the current line contains potential RTL characters
          $pos = strcspn ( $line, self::$RtlCharacterPrefixes ) ;
          $length = strlen ( $line ) ;

          // If not, simply store it as is
          if ( $pos == $length )
          {
              $new_lines [] = $line ;
              continue ;
          }

          // The line is split into "words", stored as arrays of two elements :
          // - The first one is a boolean which is true for RTL characters and for separators
          //   (spaces and punctuation), and false for any other text
          // - The second one is the string itself
          $words = array ( ) ;

          // Start of the string is not an RTL sequence ; we can add it to our $words array
          if ( $pos )
          {
              $word = substr ( $line, 0, $pos ) ;
              $words [] = array ( $this -> __is_rtl_separator ( $word ), $word ) ;
          }

          $in_rtl = true ;

          // Loop through remaining characters of the current line
          while ( $pos < $length )
          {
              // Character at the current position may be an RTL character
              if ( $in_rtl )
              {
                  $rtl_text = '' ;
                  $rtl_char = '' ;
                  $rtl_char_length = 0 ;
                  $found_rtl = false ;

                  // Collect all the consecutive RTL characters ; each new character is put before
                  // the ones already collected, which reverses the word
                  while ( $pos < $length && $this -> __is_rtl_character ( $line, $pos, $rtl_char, $rtl_char_length ) )
                  {
                      $rtl_text = $rtl_char . $rtl_text ;
                      $pos += $rtl_char_length ;
                      $found_rtl = true ;
                  }

                  // The byte at the current position looked like an RTL prefix but did not start
                  // an RTL character : store it alone as regular text
                  if ( $found_rtl )
                      $words [] = array ( true, $rtl_text ) ;
                  else
                      $words [] = array ( false, $line [ $pos ++ ] ) ;

                  // For now, we are no more in a series of RTL characters
                  $in_rtl = false ;
              }
              // Non-RTL characters : collect them until either the end of the current line or the next RTL character
              else
              {
                  $next_pos = $pos + strcspn ( $line, self::$RtlCharacterPrefixes, $pos ) ;

                  if ( $next_pos >= $length )
                  {
                      // Last piece of text of the line : moving to the end of the line stops the
                      // loop, but only after the text has been stored below (leaving the loop from
                      // here would lose it)
                      $word = substr ( $line, $pos ) ;
                      $pos = $length ;
                  }
                  else
                  {
                      $word = substr ( $line, $pos, $next_pos - $pos ) ;
                      $pos = $next_pos ;
                      $in_rtl = true ;
                  }

                  // Make the distinction between a sequence of spaces and punctuation, which is
                  // reordered with the RTL words around it, and a real piece of text
                  $words [] = array ( $this -> __is_rtl_separator ( $word ), $word ) ;
              }
          }

          // Gather all the consecutive items whose first entry is true, then invert their order.
          // Other strings are not affected by this process.
          $stacked_rtl_words = array ( ) ;
          $new_words = array ( ) ;

          foreach ( $words as $word )
          {
              // RTL word or separator : put it onto the stack
              if ( $word [0] )
                  $stacked_rtl_words [] = $word [1] ;
              // Other text : add it as is to the output array, $new_words
              else
              {
                  // But if RTL words were stacked before, invert them and add them to the output array
                  if ( count ( $stacked_rtl_words ) )
                  {
                      $new_words = array_merge ( $new_words, array_reverse ( $stacked_rtl_words ) ) ;
                      $stacked_rtl_words = array ( ) ;
                  }

                  $new_words [] = $word [1] ;
              }
          }

          // Process any remaining RTL words that may have been stacked and not yet processed
          if ( count ( $stacked_rtl_words ) )
              $new_words = array_merge ( $new_words, array_reverse ( $stacked_rtl_words ) ) ;

          $new_lines [] = implode ( '', $new_words ) ;
      }

      // All done, return a catenation of all the lines processed so far
      return ( implode ( "\n", $new_lines ) ) ;
  }
  6132.  
  6133.  
  6134. // __is_rtl_character -
  6135. // Checks if the sequence starting at $pos in string $text is a character belonging to an RTL language.
  6136. // If yes, returns true and sets $rtl_char to the UTF8 string sequence for that character, and $rtl_char_length
  6137. // to the length of this string.
  6138. // If no, returns false.
  private function __is_rtl_character ( $text, $pos, &$rtl_char, &$rtl_char_length )
  {
      // Tells whether the bytes starting at $pos in $text form a character listed in
      // self::$RtlCharacters. On success, $rtl_char receives the character bytes and
      // $rtl_char_length their count ; on failure, both are left untouched.
      $prefix = $text [ $pos ] ;

      // The first byte must be one of the known prefixes
      if ( ! isset ( self::$RtlCharacterPrefixLengths [ $prefix ] ) )
          return ( false ) ;

      // Number of bytes expected after the prefix, and the bytes themselves
      $trail_length = self::$RtlCharacterPrefixLengths [ $prefix ] ;
      $trail = substr ( $text, $pos + 1, $trail_length ) ;

      // Each range is a pair of strings giving the lowest and highest byte sequences allowed
      // after this prefix
      foreach ( self::$RtlCharacters [ $prefix ] as $range )
      {
          if ( strcmp ( $range [0], $trail ) > 0 || strcmp ( $range [1], $trail ) < 0 )
              continue ;

          $rtl_char = $prefix . $trail ;
          $rtl_char_length = $trail_length + 1 ;

          return ( true ) ;
      }

      return ( false ) ;
  }
  6170.  
  6171.  
  6172. // __is_rtl_separator -
  6173. // RTL words are separated by spaces and punctuation signs that are specified as LTR characters.
  6174. // However, such sequences, which are separators between words, must be considered as being part
  6175. // of an RTL sequence of words and therefore be reversed with them.
  6176. // This function helps to determine if the supplied string is simply a sequence of spaces and
  6177. // punctuation (a word separator) or plain text, that must keep its position in the line.
  private function __is_rtl_separator ( $text )
  {
      // Returns true when $text is only made of characters from the $separators list (this includes
      // the empty string), false otherwise. Strings already recognized as separators are remembered
      // in $known_separators so that they are not scanned again.
      static $known_separators = array ( ) ;
      static $separators = " \t,.;:/!-_=+" ;

      if ( ! isset ( $known_separators [ $text ] ) )
      {
          // The leading run of separator characters must cover the whole string
          if ( strspn ( $text, $separators ) !== strlen ( $text ) )
              return ( false ) ;

          $known_separators [ $text ] = true ;
      }

      return ( true ) ;
  }
  6196.  
  6197.  
  6198. // __strip_useless_instructions :
  6199. // Removes from a text stream all the Postscript instructions that are not meaningful for text extraction
  6200. // (these are mainly shape drawing instructions).
  private function __strip_useless_instructions ( $data )
  {
      // Replaces with a space everything in $data that matches the regular expressions of the
      // IgnoredInstructions property, and returns the result. The sizes of the data before and
      // after this operation are added to the 'TextSize' and 'OptimizedTextSize' statistics.
      $stripped = preg_replace ( $this -> IgnoredInstructions, ' ', $data ) ;
      $sizes = array
         (
          'TextSize' => strlen ( $data ),
          'OptimizedTextSize' => strlen ( $stripped )
         ) ;

      foreach ( $sizes as $statistic => $size )
          $this -> Statistics [ $statistic ] += $size ;

      return ( $stripped ) ;
  }
  6210.  
  6211.  
  6212. /*--------------------------------------------------------------------------------------------------------------
  6213.  
  6214. NAME
  6215. IsPageSelected - Checks if a page is selected for output.
  6216.  
  6217. PROTOTYPE
  6218. $status = $this -> IsPageSelected ( $page ) ;
  6219.  
  6220. DESCRIPTION
  6221. Checks if the specified page is to be selected for output.
  6222.  
  6223. PARAMETERS
  6224. $page (integer) -
  6225. Page to be checked.
  6226.  
  6227. RETURN VALUE
  6228. True if the page is to be selected for output, false otherwise.
  6229.  
  6230. *-------------------------------------------------------------------------------------------------------------*/
  protected function IsPageSelected ( $page )
  {
      // The MaxSelectedPages property drives the selection :
      //   - Zero (or any empty value) : every page is selected
      //   - Positive value "n"        : pages up to "n" are selected
      //   - Negative value "-n"       : only the last "n" pages of the page map are selected
      $limit = $this -> MaxSelectedPages ;

      if ( ! $limit )
          $selected = true ;
      else if ( $limit > 0 )
          $selected = ( $page <= $limit ) ;
      else
          $selected = ( $page > count ( $this -> PageMap -> Pages ) + $limit ) ;

      return ( $selected ) ;
  }
  6242.  
  6243.  
  6244. /*--------------------------------------------------------------------------------------------------------------
  6245.  
  6246. NAME
  6247. PeekAuthorInformation - Gets author information from the specified object data.
  6248.  
  6249. PROTOTYPE
  6250. $this -> PeekAuthorInformation ( $object_id, $object_data ) ;
  6251.  
  6252. DESCRIPTION
  6253. Checks if the specified object data may contain author information, ie if it contains the /Author or
  6254. /CreationDate keyword. If so, sets the GotAuthorInformation property and returns the object id ; otherwise, returns false.
  6255.  
  6256. PARAMETERS
  6257. $object_id (integer) -
  6258. Object id of this text block.
  6259.  
  6260. $object_data (string) -
  6261. Stream contents.
  6262.  
  6263. *-------------------------------------------------------------------------------------------------------------*/
  protected function PeekAuthorInformation ( $object_id, $object_data )
  {
      // Only the /Author and /CreationDate keywords are searched for. When neither of them is
      // present, false is returned and the GotAuthorInformation property is left untouched.
      if ( strpos ( $object_data, '/Author' ) === false && strpos ( $object_data, '/CreationDate' ) === false )
          return ( false ) ;

      // Remember that an object with author information was seen, and tell the caller which one
      $this -> GotAuthorInformation = true ;

      return ( $object_id ) ;
  }
  6274.  
  6275.  
  6276. /*--------------------------------------------------------------------------------------------------------------
  6277.  
  6278. NAME
  6279. RetrieveAuthorInformation - Extracts author information
  6280.  
  6281. PROTOTYPE
  6282. $this -> RetrieveAuthorInformation ( $object_id, $pdf_objects ) ;
  6283.  
  6284. DESCRIPTION
  6285. Extracts the author information. Handles the case where flag values refer to existing objects.
  6286.  
  6287. PARAMETERS
  6288. $object_id (integer) -
  6289. Id of the object containing the author information.
  6290.  
  6291. $pdf_objects (array) -
  6292. Array whose keys are the PDF object ids, and values their corresponding contents.
  6293.  
  6294. *-------------------------------------------------------------------------------------------------------------*/
  protected function RetrieveAuthorInformation ( $object_id, $pdf_objects )
  {
      // Sets the Author, CreatorApplication, ProducerApplication, Title, Keywords, Subject, CreationDate
      // and ModificationDate properties from the corresponding keywords of object $object_id.
      // Values can be given as literal strings "(...)", hexadecimal strings "<...>", or as a reference
      // ("x y R") to an object of $pdf_objects that contains the value.
      // Nothing is done when $object_id is not a key of $pdf_objects.

      // Matches a keyword immediately followed by the opening delimiter of its value
      static $re = '#
          (?P<info>
              /
              (?P<keyword> (Author) | (Creator) | (Producer) | (Title) | (CreationDate) | (ModDate) | (Keywords) | (Subject) )
              \s*
              (?P<opening> [(<])
          )
          #imsx' ;
      // Matches a keyword whose value is a reference to another object
      static $object_re = '#
          (?P<info>
              /
              (?P<keyword> (Author) | (Creator) | (Producer) | (Title) | (CreationDate) | (ModDate) | (Keywords) | (Subject) )
              \s*
              (?P<object_ref>
                  (?P<object> \d+)
                  \s+
                  \d+
                  \s+
                  R
              )
          )
          #imsx' ;

      if ( ! isset ( $pdf_objects [ $object_id ] ) )
          return ;

      $object_data = $pdf_objects [ $object_id ] ;

      // Pre-process keywords whose values refer to existing objects : the reference is replaced with
      // the contents of the referenced object. Each reference is replaced where it was matched, so
      // that a reference such as "12 0 R" cannot alter another one such as "112 0 R".
      // Some buggy PDF may reference author information objects that do not exist ; the value is
      // then empty.
      $inlined_data = preg_replace_callback
         (
          $object_re,
          function ( $match ) use ( $pdf_objects )
          {
              $referenced_id = $match [ 'object' ] ;
              $contents = isset ( $pdf_objects [ $referenced_id ] ) ? trim ( $pdf_objects [ $referenced_id ] ) : '' ;
              $keyword_part = substr ( $match [0], 0, strlen ( $match [0] ) - strlen ( $match [ 'object_ref' ] ) ) ;

              return ( $keyword_part . $contents ) ;
          },
          $object_data
         ) ;

      // preg_replace_callback() returns null when an error occurred
      if ( $inlined_data !== null )
          $object_data = $inlined_data ;

      if ( ! preg_match_all ( $re, $object_data, $matches, PREG_OFFSET_CAPTURE ) )
          return ;

      $object_length = strlen ( $object_data ) ;

      for ( $i = 0, $count = count ( $matches [ 'keyword' ] ) ; $i < $count ; $i ++ )
      {
          $keyword = $matches [ 'keyword' ] [$i] [0] ;
          $opening = $matches [ 'opening' ] [$i] [0] ;
          // The value starts right after the opening delimiter
          $start_index = $matches [ 'info' ] [$i] [1] + strlen ( $matches [ 'info' ] [$i] [0] ) ;

          // Text between parentheses : the text is written as is
          if ( $opening == '(' )
          {
              // Since the value can contain any character, including "\" or "(", we will have to
              // find the real closing parenthesis
              $parent_level = 1 ;
              $value = '' ;

              for ( $j = $start_index ; $j < $object_length ; $j ++ )
              {
                  $ch = $object_data [$j] ;

                  if ( $ch == '\\' )
                  {
                      // Keep the escape sequence for later decoding
                      $value .= '\\' ;

                      if ( $j + 1 < $object_length )
                          $value .= $object_data [++$j] ;
                  }
                  else if ( $ch == '(' )
                  {
                      $value .= '(' ;
                      $parent_level ++ ;
                  }
                  else if ( $ch == ')' )
                  {
                      $parent_level -- ;

                      if ( ! $parent_level )
                          break ;

                      $value .= ')' ;
                  }
                  else
                      $value .= $ch ;
              }

              // Escape sequences are only meaningful in literal strings
              $value = $this -> __extract_chars_from_block ( $value ) ;
          }
          // Text within angle brackets, written as hex digits
          else
          {
              $end_index = strpos ( $object_data, '>', $start_index ) ;

              if ( $end_index === false )
                  $end_index = $object_length ;

              // Whitespace between hex digits is not significant, and a missing last digit is zero
              $hexdigits = preg_replace ( '/\s+/', '', substr ( $object_data, $start_index, $end_index - $start_index ) ) ;

              if ( strlen ( $hexdigits ) % 2 )
                  $hexdigits .= '0' ;

              // Anything that is not a sequence of hex digits gives an empty value. The decoded bytes
              // are used as is : they must not be searched for escape sequences.
              if ( $hexdigits !== '' && ctype_xdigit ( $hexdigits ) )
                  $value = hex2bin ( $hexdigits ) ;
              else
                  $value = '' ;
          }

          $value = $this -> __convert_utf16 ( $value ) ;

          switch ( strtolower ( $keyword ) )
          {
              case 'author' : $this -> Author = $value ; break ;
              case 'creator' : $this -> CreatorApplication = $value ; break ;
              case 'producer' : $this -> ProducerApplication = $value ; break ;
              case 'title' : $this -> Title = $value ; break ;
              case 'keywords' : $this -> Keywords = $value ; break ;
              case 'subject' : $this -> Subject = $value ; break ;
              case 'creationdate' : $this -> CreationDate = $this -> GetUTCDate ( $value ) ; break ;
              case 'moddate' : $this -> ModificationDate = $this -> GetUTCDate ( $value ) ; break ;
          }
      }

      if ( self::$DEBUG )
      {
          echo "\n----------------------------------- AUTHOR INFORMATION\n" ;
          echo ( "Author : " . $this -> Author . "\n" ) ;
          echo ( "Creator application : " . $this -> CreatorApplication . "\n" ) ;
          echo ( "Producer application : " . $this -> ProducerApplication . "\n" ) ;
          echo ( "Title : " . $this -> Title . "\n" ) ;
          echo ( "Subject : " . $this -> Subject . "\n" ) ;
          echo ( "Keywords : " . $this -> Keywords . "\n" ) ;
          echo ( "Creation date : " . $this -> CreationDate . "\n" ) ;
          echo ( "Modification date : " . $this -> ModificationDate . "\n" ) ;
      }
  }
  6419.  
  6420.  
  6421. /*--------------------------------------------------------------------------------------------------------------
  6422.  
  6423. NAME
  6424. RetrieveFormData - Retrieves raw form data
  6425.  
  6426. PROTOTYPE
  6427. $this -> RetrieveFormData ( $object_id, $object_data, $pdf_objects ) ;
  6428.  
  6429. DESCRIPTION
  6430. Retrieves raw form data (form definition and field values definition).
  6431.  
  6432. PARAMETERS
  6433. $object_id (integer) -
  6434. Id of the object containing the references to the form definition and field values.
  6435.  
  6436. $object_data (string) -
  6437. Object contents.
  6438.  
  6439. $pdf_objects (array) -
  6440. Array whose keys are the PDF object ids, and values their corresponding contents.
  6441.  
  6442. NOTES
  6443. This function only memorizes the contents of form data definitions. The actual data will be processed
  6444. only if the GetFormData() function is called.
  6445.  
  6446. *-------------------------------------------------------------------------------------------------------------*/
  protected function RetrieveFormData ( $object_id, $object_data, $pdf_objects )
  {
      // Searches $object_data for the "(datasets) x y R" and "(form) x y R" references, and stores the
      // ids of the referenced objects in the FormData property, under the $object_id key :
      //   'values' - Id of the object referenced by "datasets" (field values)
      //   'form'   - Id of the object referenced by "form" (form definition)
      // Nothing is stored when one of the references is missing or points to an object that is not
      // in $pdf_objects ; a warning is issued in debug mode.

      // Retrieve the object that contains the field values
      if ( ! preg_match ( '#\b R \s* \( \s* datasets \s* \) \s* (?P<object> \d+) \s+ \d+ \s+ R#imsx', $object_data, $field_match ) )
      {
          if ( self::$DEBUG )
              warning ( "No reference to field definitions found in object #$object_id." ) ;

          return ;
      }

      $field_object = $field_match [ 'object' ] ;

      if ( ! isset ( $pdf_objects [ $field_object ] ) )
      {
          if ( self::$DEBUG )
              warning ( "Field definitions object #$field_object not found in object #$object_id." ) ;

          return ;
      }

      // Retrieve the object that contains the form definition
      if ( ! preg_match ( '#\b R \s* \( \s* form \s* \) \s* (?P<object> \d+) \s+ \d+ \s+ R#imsx', $object_data, $form_match ) )
      {
          if ( self::$DEBUG )
              warning ( "No reference to form definitions found in object #$object_id." ) ;

          return ;
      }

      $form_object = $form_match [ 'object' ] ;

      if ( ! isset ( $pdf_objects [ $form_object ] ) )
      {
          if ( self::$DEBUG )
              warning ( "Form definitions object #$form_object not found in object #$object_id." ) ;

          return ;
      }

      // Add this entry to form data information
      $this -> FormData [ $object_id ] = array
         (
          'values' => ( int ) $field_object,
          'form' => ( int ) $form_object
         ) ;
  }
  6479.  
  6480.  
  6481. }
  6482.  
  6483.  
  6484. /**************************************************************************************************************
  6485. **************************************************************************************************************
  6486. **************************************************************************************************************
  6487. ****** ******
  6488. ****** ******
  6489. ****** FONT TABLE MANAGEMENT ******
  6490. ****** ******
  6491. ****** ******
  6492. **************************************************************************************************************
  6493. **************************************************************************************************************
  6494. **************************************************************************************************************/
  6495.  
  6496. /*==============================================================================================================
  6497.  
  6498. PdfTexterFontTable class -
  6499. The PdfTexterFontTable class is not supposed to be used outside the context of the PdfToText class.
  6500. Its purposes are to hold a list of font definitions taken from a pdf document, along with their
  6501. associated character mapping tables, if any.
  6502. This is why no provision has been made to design this class as a general-purpose class ; its utility
  6503. exists only in the scope of the PdfToText class.
  6504.  
  6505. ==============================================================================================================*/
  6506. class PdfTexterFontTable extends PdfObjectBase
  6507. {
  6508. // Font table
  6509. public $Fonts = array ( ) ;
  6510. private $DefaultFont = false ;
  6511. // Font mapping between a font number and an object number
  6512. private $FontMap = array ( ) ;
  6513. // A character map buffer is used to store results from previous calls to the MapCharacter() method of the
  6514. // FontTable object. It dramatically reduces the number of calls needed, from one call for each character
  6515. // defined in the pdf stream, to one call on each DISTINCT character defined in the PDF stream.
  6516. // As an example, imagine a PDF file that contains 200K characters, but only 150 distinct ones. The
  6517. // MapCharacter method will be called 150 times, instead of 200 000...
  6518. private $CharacterMapBuffer = array ( ) ;
  6519.  
  6520.  
  6521. // Constructor -
  6522. // Well, does not do anything special
  6523. public function __construct ( )
  6524. {
  6525. parent::__construct ( ) ;
  6526. }
  6527.  
  6528.  
  6529. // Add -
  6530. // Adds the current font declaration to the font table. Handles special cases where font id is not
  6531. // given by the object id, but rather by <</Rx...>> constructs
  6532. public function Add ( $object_id, $font_definition, $pdf_objects, $extra_mappings )
  6533. {
  6534. if ( PdfToText::$DEBUG )
  6535. {
  6536. echo "\n----------------------------------- FONT #$object_id\n" ;
  6537. echo $font_definition ;
  6538. }
  6539.  
  6540. $font_type = PdfTexterFont::FONT_ENCODING_STANDARD ;
  6541. $cmap_id = 0 ;
  6542. $secondary_cmap_id = 0 ;
  6543. $font_variant = false ;
  6544.  
  6545. // Font resource id specification
  6546. if ( preg_match ( '#<< \s* (?P<rscdefs> /R\d+ .*) >>#ix', $font_definition, $match ) )
  6547. {
  6548. $resource_definitions = $match [ 'rscdefs' ] ;
  6549.  
  6550. preg_match_all ( '#/R (?P<font_id> \d+) #ix', $resource_definitions, $id_matches ) ;
  6551. preg_match_all ( '#/ToUnicode \s* (?P<cmap_id> \d+)#ix', $resource_definitions, $cmap_matches ) ;
  6552.  
  6553. $count = count ( $id_matches [ 'font_id' ] ) ;
  6554.  
  6555. for ( $i = 0 ; $i < $count ; $i ++ )
  6556. {
  6557. $font_id = $id_matches [ 'font_id' ] [$i] ;
  6558. $cmap_id = $cmap_matches [ 'cmap_id' ] [$i] ;
  6559.  
  6560. $this -> Fonts [ $font_id ] = new PdfTexterFont ( $font_id, $cmap_id, PdfTexterFont::FONT_ENCODING_UNICODE_MAP, $extra_mappings ) ;
  6561. }
  6562.  
  6563. return ;
  6564. }
  6565. // Experimental implementation of CID fonts
  6566. else if ( preg_match ( '#/(Base)?Encoding \s* /Identity-H#ix', $font_definition ) )
  6567. {
  6568. if ( preg_match ( '#/BaseFont \s* /(?P<font> [^\s/]+)#ix', $font_definition, $match ) )
  6569. $font_variant = $match [ 'font' ] ;
  6570.  
  6571. $font_type = PdfTexterFont::FONT_ENCODING_CID_IDENTITY_H ;
  6572. }
  6573. // Font has an associated Unicode map (using the /ToUnicode keyword)
  6574. else if ( preg_match ( '#/ToUnicode \s* (?P<cmap> \d+)#ix', $font_definition, $match ) )
  6575. {
  6576. $cmap_id = $match [ 'cmap' ] ;
  6577. $font_type = PdfTexterFont::FONT_ENCODING_UNICODE_MAP ;
  6578.  
  6579. if ( preg_match ( '#/Encoding \s* (?P<cmap> \d+)#ix', $font_definition, $secondary_match ) )
  6580. $secondary_cmap_id = $secondary_match [ 'cmap' ] ;
  6581. }
  6582. // Font has an associated character map (using a cmap id)
  6583. else if ( preg_match ( '#/Encoding \s* (?P<cmap> \d+) \s+ \d+ #ix', $font_definition, $match ) )
  6584. {
  6585. $cmap_id = $match [ 'cmap' ] ;
  6586. $font_type = PdfTexterFont::FONT_ENCODING_PDF_MAP ;
  6587. }
  6588. // Font uses the Windows Ansi encoding
  6589. else if ( preg_match ( '#/(Base)?Encoding \s* /WinAnsiEncoding#ix', $font_definition ) )
  6590. {
  6591. $font_type = PdfTexterFont::FONT_ENCODING_WINANSI ;
  6592.  
  6593. if ( preg_match ( '# /BaseFont \s* / [a-z0-9_]+ \+ [a-z0-9_]+? Cyr #imsx', $font_definition ) )
  6594. $font_type |= PdfTexterFont::FONT_VARIANT_ISO8859_5 ;
  6595. }
  6596. // Font uses the Mac Roman encoding
  6597. else if ( preg_match ( '#/(Base)?Encoding \s* /MacRomanEncoding#ix', $font_definition ) )
  6598. $font_type = PdfTexterFont::FONT_ENCODING_MAC_ROMAN ;
  6599.  
  6600. $this -> Fonts [ $object_id ] = new PdfTexterFont ( $object_id, $cmap_id, $font_type, $secondary_cmap_id, $pdf_objects, $extra_mappings, $font_variant ) ;
  6601.  
  6602. // Arbitrarily set the default font to the first font encountered in the pdf file
  6603. if ( $this -> DefaultFont === false )
  6604. {
  6605. reset ( $this -> Fonts ) ;
  6606. $this -> DefaultFont = key ( $this -> Fonts ) ;
  6607. }
  6608. }
  6609.  
  6610.  
  6611. // AddFontMap -
  6612. // Process things like :
  6613. // <</F1 26 0 R/F2 22 0 R/F3 18 0 R>>
  6614. // which maps font 1 (when specified with the /Fx instruction) to object 26,
  6615. // 2 to object 22 and 3 to object 18, respectively, in the above example.
  6616. // Found also a strange way of specifying a font mapping :
  6617. // <</f-0-0 5 0 R etc.
  6618. // And yet another one :
  6619. // <</C0_0 5 0 R
  6620. public function AddFontMap ( $object_id, $object_data )
  6621. {
  6622. $object_data = self::UnescapeHexCharacters ( $object_data ) ;
  6623.  
  6624. // The same object can hold different notations for font associations
  6625. if ( preg_match_all ( '# (?P<font> ' . self::$FontSpecifiers . ' ) \s+ (?P<object> \d+) #imsx', $object_data, $matches ) )
  6626. {
  6627. for ( $i = 0, $count = count ( $matches [ 'font' ] ) ; $i < $count ; $i ++ )
  6628. {
  6629. $font = $matches [ 'font' ] [$i] ;
  6630. $object = $matches [ 'object' ] [$i] ;
  6631.  
  6632. $this -> FontMap [ $font ] = $object ;
  6633. }
  6634. }
  6635. }
  6636.  
  6637.  
  6638. // AddPageFontMap -
  6639. // Adds font aliases to the current font map, in the form : "page:xobject:font".
  6640. // The associated value is the font object itself.
  6641. public function AddPageFontMap ( $map )
  6642. {
  6643. foreach ( $map as $map_entry )
  6644. {
  6645. $this -> FontMap [ $map_entry [ 'page' ] . ':' . $map_entry [ 'xobject-name' ] . ':' . $map_entry [ 'font-name' ] ] = $map_entry [ 'object' ] ;
  6646. }
  6647. }
  6648.  
  6649.  
  6650. // AddCharacterMap -
  6651. // Associates a character map to a font declaration that referenced it.
  6652. public function AddCharacterMap ( $cmap )
  6653. {
  6654. $status = false ;
  6655.  
  6656. // We loop through all fonts, since the same character map can be referenced by several font definitions
  6657. foreach ( $this -> Fonts as $font )
  6658. {
  6659. if ( $font -> CharacterMapId == $cmap -> ObjectId )
  6660. {
  6661. $font -> CharacterMap = $cmap ;
  6662. $status = true ;
  6663. }
  6664. else if ( $font -> SecondaryCharacterMapId == $cmap -> ObjectId )
  6665. {
  6666. $cmap -> Secondary = true ;
  6667. $font -> SecondaryCharacterMap = $cmap ;
  6668. $status = true ;
  6669. }
  6670. }
  6671.  
  6672. return ( $status ) ;
  6673. }
  6674.  
  6675.  
  6676. // GetFontAttributes -
  6677. // Gets the specified font width in hex digits and whether the font has a character map or not.
  6678. public function GetFontAttributes ( $page_number, $template, $font, &$font_map_width, &$font_mapped )
  6679. {
  6680. // Font considered as global to the document
  6681. if ( isset ( $this -> Fonts [ $font ] ) )
  6682. $key = $font ;
  6683. // Font not found : try to use the first one declared in the document
  6684. else
  6685. {
  6686. reset ( $this -> Fonts ) ;
  6687. $key = key ( $this -> Fonts ) ;
  6688. }
  6689.  
  6690. // Font has an associated character map
  6691. if ( $key && $this -> Fonts [ $key ] -> CharacterMap )
  6692. {
  6693. $font_map_width = $this -> Fonts [ $key ] -> CharacterMap -> HexCharWidth ;
  6694. $font_mapped = true ;
  6695.  
  6696. return ( true ) ;
  6697. }
  6698. // No character map : characters are specified as two hex digits
  6699. else
  6700. {
  6701. $font_map_width = 2 ;
  6702. $font_mapped = false ;
  6703.  
  6704. return ( false ) ;
  6705. }
  6706. }
  6707.  
  6708.  
  6709. // GetFontByMapId -
  6710. // Returns the font id (object id) associated with the specified mapped id.
  6711. public function GetFontByMapId ( $page_number, $template, $id )
  6712. {
  6713. if ( isset ( $this -> FontMap [ "$page_number:$template:$id" ] ) )
  6714. $font_object = $this -> FontMap [ "$page_number:$template:$id" ] ;
  6715. else if ( isset ( $this -> FontMap [ $id ] ) )
  6716. $font_object = $this -> FontMap [ $id ] ;
  6717. else
  6718. $font_object = -1 ;
  6719.  
  6720. return ( $font_object ) ;
  6721. }
  6722.  
  6723.  
  6724. // GetFontObject -
  6725. // Returns the PdfTexterFont object for the given page, template and font id (in the form of "/something")
  6726. public function GetFontObject ( $page_number, $template, $id )
  6727. {
  6728. if ( isset ( $this -> FontMap [ "$page_number:$template:$id" ] ) )
  6729. $font_object = $this -> FontMap [ "$page_number:$template:$id" ] ;
  6730. else if ( isset ( $this -> FontMap [ $id ] ) )
  6731. $font_object = $this -> FontMap [ $id ] ;
  6732. else
  6733. return ( false ) ;
  6734.  
  6735. if ( isset ( $this -> Fonts [ $font_object ] ) )
  6736. return ( $this -> Fonts [ $font_object ] ) ;
  6737. else
  6738. return ( false ) ;
  6739. }
  6740.  
  6741.  
  6742. // MapCharacter -
  6743. // Returns the character associated to the specified one.
  6744. public function MapCharacter ( $font, $ch, $return_false_on_failure = false )
  6745. {
  6746. if ( isset ( $this -> CharacterMapBuffer [ $font ] [ $ch ] ) )
  6747. return ( $this -> CharacterMapBuffer [ $font ] [ $ch ] ) ;
  6748.  
  6749. // Use the first declared font as the default font, if none defined
  6750. if ( $font == -1 )
  6751. $font = $this -> DefaultFont ;
  6752.  
  6753. $cache = true ;
  6754.  
  6755. if ( isset ( $this -> Fonts [ $font ] ) )
  6756. {
  6757. $font_object = $this -> Fonts [ $font ] ;
  6758.  
  6759. $code = $font_object -> MapCharacter ( $ch, $return_false_on_failure ) ;
  6760.  
  6761. if ( $font_object -> CharacterMap )
  6762. $cache = $font_object -> CharacterMap -> Cache ;
  6763. }
  6764. else
  6765. {
  6766. $code = $this -> CodePointToUtf8 ( $ch ) ;
  6767. }
  6768.  
  6769. if ( $cache )
  6770. $this -> CharacterMapBuffer [ $font ] [ $ch ] = $code ;
  6771.  
  6772. return ( $code ) ;
  6773. }
  6774. }
  6775.  
  6776.  
  6777. /**************************************************************************************************************
  6778. **************************************************************************************************************
  6779. **************************************************************************************************************
  6780. ****** ******
  6781. ****** ******
  6782. ****** FONT MANAGEMENT ******
  6783. ****** ******
  6784. ****** ******
  6785. **************************************************************************************************************
  6786. **************************************************************************************************************
  6787. **************************************************************************************************************/
  6788.  
  6789. /*==============================================================================================================
  6790.  
  6791. PdfTexterFont class -
  6792. The PdfTexterFont class is not supposed to be used outside the context of the PdfToText class.
  6793. It holds an optional character mapping table associated with this font.
  6794. No provision has been made to design this class as a general-purpose class ; its utility exists only in
  6795. the scope of the PdfToText class.
  6796.  
  6797. ==============================================================================================================*/
  6798. class PdfTexterFont extends PdfObjectBase
  6799. {
  6800. // Font encoding types, for fonts that are neither associated with a Unicode character map nor a PDF character map
  6801. const FONT_ENCODING_STANDARD = 0 ; // No character map, use the standard character set
  6802. const FONT_ENCODING_WINANSI = 1 ; // No character map, use the Windows Ansi character set
  6803. const FONT_ENCODING_MAC_ROMAN = 2 ; // No character map, use the MAC OS Roman character set
  6804. const FONT_ENCODING_UNICODE_MAP = 3 ; // Font has an associated unicode character map
  6805. const FONT_ENCODING_PDF_MAP = 4 ; // Font has an associated PDF character map
  6806. const FONT_ENCODING_CID_IDENTITY_H = 5 ; // CID font : IDENTITY-H
  6807.  
  6808. // Font variants
  6809. const FONT_VARIANT_STANDARD = 0x0000 ;
  6810. const FONT_VARIANT_ISO8859_5 = 0x1000 ; // Cyrillic
  6811.  
  6812. const FONT_VARIANT_MASK = 0xF000 ;
  6813. const FONT_VARIANT_SHIFT = 12 ;
  6814.  
  6815. // Font resource id (may be an object id, overridden by <</Rx...>> constructs
  6816. public $Id ;
  6817. // Font type and variant
  6818. public $FontType ;
  6819. public $FontVariant ;
  6820. // Character map id, specified by the /ToUnicode flag
  6821. public $CharacterMapId ;
  6822. // Secondary character map id, specified by the /Encoding flag and that can contain a /Differences flag
  6823. public $SecondaryCharacterMapId ;
  6824. // Optional character map, that may be set by the PdfToText::Load method just before processing text drawing blocks
  6825. public $CharacterMap = null ;
  6826. public $SecondaryCharacterMap = null ;
  6827. // Character widths
  6828. public $CharacterWidths = array ( ) ;
  6829. // Default character width, if not present in the $CharacterWidths array
  6830. public $DefaultWidth = 0 ;
  6831. private $GotWidthInformation = false ;
  6832. // A buffer for remembering character widths
  6833. protected $CharacterWidthsBuffer = array ( ) ;
  6834.  
  6835.  
  6836. // Constructor -
  6837. // Builds a PdfTexterFont object, using its resource id and optional character map id.
  6838. public function __construct ( $resource_id, $cmap_id, $font_type, $secondary_cmap_id = null, $pdf_objects = null, $extra_mappings = null, $font_variant = false )
  6839. {
  6840.  
  6841. parent::__construct ( ) ;
  6842.  
  6843. $this -> Id = $resource_id ;
  6844. $this -> CharacterMapId = $cmap_id ;
  6845. $this -> SecondaryCharacterMapId = $secondary_cmap_id ;
  6846. $this -> FontType = $font_type & ~self::FONT_VARIANT_MASK ;
  6847. $this -> FontVariant = ( $font_type >> self::FONT_VARIANT_SHIFT ) & 0x0F ;
  6848.  
  6849. // Instantiate the appropriate character map for this font
  6850. switch ( $this -> FontType )
  6851. {
  6852. case self::FONT_ENCODING_WINANSI :
  6853. $this -> CharacterMap = new PdfTexterAdobeWinAnsiMap ( $resource_id, $this -> FontVariant ) ;
  6854. break ;
  6855.  
  6856. case self::FONT_ENCODING_MAC_ROMAN :
  6857. $this -> CharacterMap = new PdfTexterAdobeMacRomanMap ( $resource_id, $this -> FontVariant ) ;
  6858. break ;
  6859.  
  6860. case self::FONT_ENCODING_CID_IDENTITY_H :
  6861. $this -> CharacterMap = new PdfTexterIdentityHCIDMap ( $resource_id, $font_variant ) ;
  6862. break ;
  6863.  
  6864. case self::FONT_ENCODING_PDF_MAP :
  6865. $this -> CharacterMap = new PdfTexterEncodingMap ( $cmap_id, $pdf_objects [ $cmap_id ], $extra_mappings ) ;
  6866. break ;
  6867.  
  6868. case self::FONT_ENCODING_UNICODE_MAP :
  6869. break ;
  6870.  
  6871. case self::FONT_ENCODING_STANDARD :
  6872. break ;
  6873.  
  6874. default :
  6875. if ( PdfToText::$DEBUG )
  6876. warning ( "Unknown font type #$font_type found for object #$resource_id, character map #$cmap_id." ) ;
  6877. }
  6878.  
  6879. // Get font data ; include font descriptor information if present
  6880. $font_data = $pdf_objects [ $resource_id ] ;
  6881.  
  6882. if ( preg_match ( '/FontDescriptor \s+ (?P<id> \d+) \s+ \d+ \s+ R/imsx', $font_data, $match ) )
  6883. {
  6884. $descriptor_id = $match [ 'id' ] ;
  6885.  
  6886. // Don't care about searching this in that object, or that in this object - simply catenate the font descriptor
  6887. // with the font definition
  6888. if ( isset ( $pdf_objects [ $descriptor_id ] ) )
  6889. $font_data .= $pdf_objects [ $descriptor_id ] ;
  6890. }
  6891.  
  6892. // Type1 fonts belong to the Adobe 14 standard fonts available. Information about the character widths is never embedded in the PDF
  6893. // file, but must be taken from external data (in the FontMetrics directory).
  6894. if ( preg_match ( '#/SubType \s* /Type1#ix', $font_data ) )
  6895. {
  6896. preg_match ( '#/BaseFont \s* / ([\w]+ \+)? (?P<font> [^\s\[</]+)#ix', $font_data, $match ) ;
  6897. $font_name = $match [ 'font' ] ;
  6898. $lc_font_name = strtolower ( $font_name ) ;
  6899.  
  6900. // Do that only if a font metrics file exists...
  6901. if ( isset ( PdfToText::$AdobeStandardFontMetrics [ $lc_font_name ] ) )
  6902. {
  6903. $metrics_file = PdfToText::$FontMetricsDirectory . '/' . PdfToText::$AdobeStandardFontMetrics [ $lc_font_name ] ;
  6904.  
  6905. if ( file_exists ( $metrics_file ) )
  6906. {
  6907. include ( $metrics_file ) ;
  6908.  
  6909. if ( isset ( $charwidths ) )
  6910. {
  6911. // Build the CharacterWidths table
  6912. foreach ( $charwidths as $char => $width )
  6913. $this -> CharacterWidths [ chr ( $char ) ] = ( double ) $width ;
  6914.  
  6915. $this -> GotWidthInformation = true ;
  6916. }
  6917. }
  6918. }
  6919. }
  6920.  
  6921. // Retrieve the character widths for this font. This means :
  6922. // - Retrieving the /FirstChar, /LastChar and /Widths entries from the font definition. /Widths is an array of individual character
  6923. // widths, between the /FirstChar and /LastChar entries. A value of zero in this array means "Use the default width"...
  6924. // - ... which is given by the /MissingWidth parameter, normally put in the font descriptor whose object id is given by the
  6925. // /FontDescriptor entry of the font definition
  6926. // Well, to be considered, given the number of buggy PDFs around the world, we won't care about the /LastChar entry and we won't
  6927. // check whether the /Widths array contains (LastChar - FirstChar + 1) integer values...
  6928. // Get the entries
  6929. $first_char = false ;
  6930. $widths = false ;
  6931. $missing_width = false ;
  6932.  
  6933. if ( preg_match ( '#/FirstChar \s+ (?P<char> \d+)#imsx', $font_data, $match ) )
  6934. $first_char = $match [ 'char' ] ;
  6935.  
  6936. if ( preg_match ( '#/Widths \s* \[ (?P<widths> [^\]]+) \]#imsx', $font_data, $match ) )
  6937. $widths = $match [ 'widths' ] ;
  6938.  
  6939. if ( preg_match ( '#/MissingWidth \s+ (?P<missing> \d+)#imsx', $font_data, $match ) )
  6940. $missing_width = $match [ 'missing' ] ;
  6941.  
  6942. // It would not make sense if one of the two entries /FirstChar and /Widths was missing
  6943. // So ensure they are all there (note that /MissingWidths can be absent)
  6944. if ( $first_char !== false && $widths )
  6945. {
  6946. if ( $missing_width !== false )
  6947. $this -> DefaultWidth = ( double ) $missing_width ;
  6948.  
  6949. // Here comes a really tricky part :
  6950. // - The PDF file can contain CharProcs (example names : /a0, /a1, etc.) for which we have no
  6951. // Unicode equivalent
  6952. // - The caller may have called the AddAdobeExtraMappings method, to providing a mapping between
  6953. // those char codes (/a0, /a1, etc.) and a Unicode equivalent
  6954. // - Each "charproc" listed in the /Differences array as a specific code, such as :
  6955. // [0/a1/a2/a3...]
  6956. // which maps /a1 to code 0, /a2 to code 1, and so on
  6957. // - However, the GetStringWidth() method provides real Unicode characters
  6958. // Consequently, we have to map each CharProc character (/a1, /a2, etc.) to the Unicode value
  6959. // that may have been specified using the AddAdobeExtraMappings() method.
  6960. // The first step below collects the name list of CharProcs.
  6961. $charprocs = false ;
  6962.  
  6963. if ( isset ( $this -> CharacterMap -> Encodings ) &&
  6964. preg_match ( '# /CharProcs \s* << (?P<list> .*?) >>#imsx', $font_data, $match ) )
  6965. {
  6966. preg_match_all ( '#/ (?P<char> \w+) \s+ \d+ \s+ \d+ \s+ R#msx', $match [ 'list' ], $char_matches ) ;
  6967.  
  6968. $charprocs = array_flip ( $char_matches [ 'char' ] ) ;
  6969. }
  6970.  
  6971. // The /FontMatrix entry defines the scaling to be used for the character widths (among other things)
  6972. if ( preg_match ( '#/FontMatrix \s* \[ \s* (?P<multiplier> \d+)#imsx', $font_data, $match ) )
  6973. $multiplier = 1000 * ( double ) $match [ 'multiplier' ] ;
  6974. else
  6975. $multiplier = 1 ;
  6976.  
  6977. $widths = trim ( preg_replace ( '/\s+/', ' ', $widths ) ) ;
  6978. $widths = explode ( ' ', $widths ) ;
  6979.  
  6980. for ( $i = 0, $count = count ( $widths ) ; $i < $count ; $i ++ )
  6981. {
  6982. $value = ( double ) trim ( $widths [$i] ) ;
  6983. $chr_index = $first_char + $i ;
  6984.  
  6985. // Tricky thing part 2 :
  6986. if ( $charprocs )
  6987. {
  6988. // If one of the CharProc characters is listed in the /Differences array then...
  6989. if ( isset ( $this -> CharacterMap -> DifferencesByPosition [ $chr_index ] ) )
  6990. {
  6991. $chname = $this -> CharacterMap -> DifferencesByPosition [ $chr_index ] ;
  6992.  
  6993. // ... if this CharProcs character is defined in the encoding table (possibly because
  6994. // it was complemeted through a call to the AddAdobeExtraMappings() method), then we
  6995. // will use its Unicode counterpart instead of the character ID coming from the
  6996. // /Differences array)
  6997. if ( isset ( $charprocs [ $chname ] ) && isset ( $this -> CharacterMap -> Encodings [ $chname ] ) )
  6998. $chr_index = $this -> CharacterMap -> Encodings [ $chname ] [2] ;
  6999. }
  7000. }
  7001.  
  7002. $this -> CharacterWidths [ chr ( $chr_index ) ] = ( $value ) ? ( $value * $multiplier ) : $this -> DefaultWidth ;
  7003. }
  7004.  
  7005. $this -> GotWidthInformation = true ;
  7006. }
  7007. }
  7008.  
  7009.  
  7010. // MapCharacter -
  7011. // Returns the substitution string value for the specified character, if the current font has an
  7012. // associated character map, or the original character encoded in utf8, if not.
  7013. public function MapCharacter ( $ch, $return_false_on_failure = false )
  7014. {
  7015. if ( $this -> CharacterMap )
  7016. {
  7017. // Character is defined in the character map ; check if it has been overridden by a /Differences array in
  7018. // a secondary character map
  7019. if ( isset ( $this -> CharacterMap [ $ch ] ) )
  7020. {
  7021. // Since a /ToUnicode map can have an associated /Encoding map with a /Differences list, this is the right place
  7022. // to perform the translation (ie, the final Unicode codepoint is impacted by the /Differences list)
  7023. if ( ! $this -> SecondaryCharacterMap ) // Most common case first !
  7024. {
  7025. $code = $this -> CharacterMap [ $ch ] ;
  7026. }
  7027. else
  7028. {
  7029. if ( isset ( $this -> SecondaryCharacterMap [ $ch ] ) )
  7030. $code = $this -> SecondaryCharacterMap [ $ch ] ;
  7031. else
  7032. $code = $this -> CharacterMap [ $ch ] ;
  7033. }
  7034.  
  7035. return ( $code ) ;
  7036. }
  7037. // On the contrary, the character may not be defined in the main character map but may exist in the secondary cmap
  7038. else if ( $this -> SecondaryCharacterMap && isset ( $this -> SecondaryCharacterMap [ $ch ] ) )
  7039. {
  7040. $code = $this -> SecondaryCharacterMap [ $ch ] ;
  7041.  
  7042. return ( $code ) ;
  7043. }
  7044. }
  7045.  
  7046. if ( $return_false_on_failure )
  7047. return ( false ) ;
  7048.  
  7049. return ( $this -> CodePointToUtf8 ( $ch ) ) ;
  7050. }
  7051.  
  7052.  
  7053. /*--------------------------------------------------------------------------------------------------------------
  7054.  
  7055. NAME
  7056. GetStringWidth - Returns the length of a string, in 1/100 of points
  7057.  
  7058. PROTOTYPE
  7059. $width = $font -> GetStringWidth ( $text, $extra_percent ) ;
  7060.  
  7061. DESCRIPTION
  7062. Returns the length of a string, in 1/100 of points.
  7063.  
  7064. PARAMETERS
  7065. $text (string) -
  7066. String whose length is to be measured.
  7067.  
  7068. $extra_percent (double) -
  7069. Extra percentage to be added to the computed width.
  7070.  
  7071. RETURN VALUE
  7072. Returns the length of the specified string in 1/1000 of text points, or 0 if the font does not
  7073. contain any character width information.
  7074.  
  7075. *-------------------------------------------------------------------------------------------------------------*/
  7076. public function GetStringWidth ( $text, $extra_percent )
  7077. {
  7078. // No width information
  7079. if ( ! $this -> GotWidthInformation )
  7080. return ( false ) ;
  7081.  
  7082. $width = 0 ;
  7083.  
  7084. // Compute the width of each individual character - use a character width buffer to avoid
  7085. // repeating the same tests again and again for characters whose width has already been processed
  7086. for ( $i = 0, $length = strlen ( $text ) ; $i < $length ; $i ++ )
  7087. {
  7088. $ch = $text [$i] ;
  7089.  
  7090. // Character already in the Widths buffer - Simply retrieve its value
  7091. if ( isset ( $this -> CharacterWidthsBuffer [ $ch ] ) )
  7092. {
  7093. $width += $this -> CharacterWidthsBuffer [ $ch ] ;
  7094. }
  7095. // New character - The width comes either from the CharacterWidths array if an entry is defined
  7096. // for this character, or from the default width property.
  7097. else
  7098. {
  7099. if ( isset ( $this -> CharacterWidths [ $ch ] ) )
  7100. {
  7101. $width += $this -> CharacterWidths [ $ch ] ;
  7102. $this -> CharacterWidthsBuffer [ $ch ] = $this -> CharacterWidths [ $ch ] ;
  7103. }
  7104. else
  7105. {
  7106. $width += $this -> DefaultWidth ;
  7107. $this -> CharacterWidthsBuffer [ $ch ] = $this -> DefaultWidth ;
  7108. }
  7109. }
  7110. }
  7111.  
  7112. // The computed width is actually longer/smaller than its actual width. Adjust by the percentage specified
  7113. // by the ExtraTextWidth property
  7114. $divisor = 100 - $extra_percent ;
  7115.  
  7116. if ( $divisor < 50 ) // Arbitrarily fix a limit
  7117. $divisor = 50 ;
  7118.  
  7119. // All done, return
  7120. return ( $width / $divisor ) ;
  7121. }
  7122. }
  7123.  
  7124.  
  7125. /*==============================================================================================================
  7126.  
  7127. PdfTexterCharacterMap -
  7128. The PdfTexterCharacterMap class is not supposed to be used outside the context of the PdfToText class.
  7129. Describes a character map.
  7130. No provision has been made to design this class as a general-purpose class ; its utility exists only in
  7131. the scope of the PdfToText class.
  7132.  
  7133. ==============================================================================================================*/
  7134. abstract class PdfTexterCharacterMap extends PdfObjectBase
  7135. implements ArrayAccess, Countable
  7136. {
  7137. // Object id of the character map
  7138. public $ObjectId ;
  7139. // Number of hex digits in a character represented in hexadecimal notation
  7140. public $HexCharWidth ;
  7141. // Set to true if the values returned by the array access operator can safely be cached
  7142. public $Cache = false ;
  7143.  
  7144.  
  7145.  
  7146. public function __construct ( $object_id )
  7147. {
  7148. parent::__construct ( ) ;
  7149. $this -> ObjectId = $object_id ;
  7150. }
  7151.  
  7152.  
  7153. /*--------------------------------------------------------------------------------------------------------------
  7154.  
  7155. CreateInstance -
  7156. Creates a PdfTexterCharacterMap instance of the correct type.
  7157.  
  7158. *-------------------------------------------------------------------------------------------------------------*/
  7159. public static function CreateInstance ( $object_id, $definitions, $extra_mappings )
  7160. {
  7161. if ( preg_match ( '# (begincmap) | (beginbfchar) | (beginbfrange) #ix', $definitions ) )
  7162. return ( new PdfTexterUnicodeMap ( $object_id, $definitions ) ) ;
  7163. else if ( stripos ( $definitions, '/Differences' ) !== false )
  7164. return ( new PdfTexterEncodingMap ( $object_id, $definitions, $extra_mappings ) ) ;
  7165. else
  7166. return ( false ) ;
  7167. }
  7168.  
  7169.  
  7170.  
  7171. /*--------------------------------------------------------------------------------------------------------------
  7172.  
  7173. Interface implementations.
  7174.  
  7175. *-------------------------------------------------------------------------------------------------------------*/
  7176. public function offsetSet ( $offset, $value )
  7177. { error ( new PdfToTextDecodingException ( "Unsupported operation." ) ) ; }
  7178.  
  7179. public function offsetUnset ( $offset )
  7180. { error ( new PdfToTextDecodingException ( "Unsupported operation." ) ) ; }
  7181. }
  7182.  
  7183.  
  7184.  
  7185. /*==============================================================================================================
  7186.  
  7187. PdfTexterUnicodeMap -
  7188. A class for fonts having a character map specified with the /ToUnicode parameter.
  7189.  
  7190. ==============================================================================================================*/
  class PdfTexterUnicodeMap extends PdfTexterCharacterMap
  {
      // Id of the character map : the number found after "/CMapName /R", or -1 when the CMap does not specify one
      public $Id ;
      // Character substitution table, using the beginbfrange/endbfrange notation
      // Only constructs of the form :
      //     <low> <high> <start>
      // are stored in this table, as array ( low, high, start ) entries. Constructs of the form :
      //     <x> <y> [ <subst_x> <subst_x+1> ... <subst_y> ]
      // are stored in the $DirectMap array, because it is conceptually the same thing in the end as a character
      // substitution being defined with the beginbfchar/endbfchar construct.
      // A dichotomic search in $RangeMap is performed for each character reference not yet seen in the pdf flow.
      // Once the substitution character has been found, it is added to the $DirectMap array for later faster access.
      // The reason for this design is that some pdf files contain beginbfrange/endbfrange constructs covering
      // thousands of codes ; for example :
      //     beginbfrange
      //     <1000> <1FFF> <1000>
      //     <2000> <2FFF> <2000>
      //     ...
      //     <A000> <AFFF> <A000>
      //     ...
      //     endbfrange
      // Naively storing a one-to-one character relationship in an associative array, such as :
      //     $array [ 0x1000 ] = 0x1000 ;
      //     $array [ 0x1001 ] = 0x1001 ;
      //     ..
      //     $array [ 0x1FFF ] = 0x1FFF ;
      //     etc.
      // can make the array so big that it exhausts all of the available memory.
      // This is why the ranges are stored as is and searched on demand ; only the codes that are really looked up
      // end up in $DirectMap.
      protected $RangeMap = array ( ) ;
      // Number of entries in $RangeMap, computed once by the constructor
      private $RangeCount = 0 ;
      // Lowest range start and highest range end seen ; used to reject out-of-range codes without searching
      private $RangeMin = PHP_INT_MAX,
              $RangeMax = -1 ;
      // Direct substitution table (source code => Unicode code point), filled from beginbfchar constructs,
      // from array-form and single-code beginbfrange constructs, and from successful range lookups
      protected $DirectMap = array ( ) ;


      // Constructor -
      //     Analyzes the text contents of a CMAP and extracts mappings from the beginbfchar/endbfchar and
      //     beginbfrange/endbfrange constructs.
      //     $object_id is forwarded to the parent constructor and used in diagnostic messages ;
      //     $definitions is the CMap program text.
      public function __construct ( $object_id, $definitions )
      {
          parent::__construct ( $object_id ) ;

          if ( PdfToText::$DEBUG )
          {
              echo "\n----------------------------------- UNICODE CMAP #$object_id\n" ;
              echo $definitions;
          }

          // Retrieve the cmap id, if any
          preg_match ( '# /CMapName \s* /R (?P<num> \d+) #ix', $definitions, $match ) ;
          $this -> Id = isset ( $match [ 'num' ] ) ? $match [ 'num' ] : -1 ;

          // Get the codespace range, which will give us the width of a character specified in hexadecimal notation
          preg_match ( '# begincodespacerange \s+ <\s* (?P<low> [0-9a-f]+) \s*> \s* <\s* (?P<high> [0-9a-f]+) \s*> \s*endcodespacerange #ix', $definitions, $match ) ;

          if ( isset ( $match [ 'low' ] ) )
              $this -> HexCharWidth = max ( strlen ( $match [ 'low' ] ), strlen ( $match [ 'high' ] ) ) ;
          else
              $this -> HexCharWidth = 0 ;

          // Widest source code (in hex digits) actually met in the bfchar/bfrange definitions
          $max_found_char_width = 0 ;

          // Process beginbfchar/endbfchar constructs
          if ( preg_match_all ( '/ beginbfchar \s* (?P<chars> .*?) endbfchar /imsx', $definitions, $char_matches ) )
          {
              foreach ( $char_matches [ 'chars' ] as $char_list )
              {
                  // beginbfchar / endbfchar constructs can behave as a kind of beginbfrange/endbfrange ; example :
                  //     <21> <0009 0020 000d>
                  // means :
                  //     . Map character #21 to #0009
                  //     . Map character #22 to #0020
                  //     . Map character #23 to #000D
                  // There is no clue in the Adobe PDF specification that a single character could be mapped to a range.
                  // The normal constructs would be :
                  //     <21> <0009>
                  //     <22> <0020>
                  //     <23> <000D>
                  preg_match_all ( '/< \s* (?P<item> .*?) \s* >/msx', $char_list, $item_matches ) ;

                  // Items come in ( source, destination ) pairs ; a trailing source code without a destination
                  // is ignored instead of being mapped to code point 0
                  for ( $i = 0, $item_count = count ( $item_matches [ 'item' ] ) ; $i + 1 < $item_count ; $i += 2 )
                  {
                      $char = hexdec ( $item_matches [ 'item' ] [$i] ) ;
                      $char_width = strlen ( $item_matches [ 'item' ] [$i] ) ;
                      $map = explode ( ' ', preg_replace ( '/\s+/', ' ', $item_matches [ 'item' ] [ $i + 1 ] ) ) ;

                      if ( $char_width > $max_found_char_width )
                          $max_found_char_width = $char_width ;

                      for ( $j = 0, $map_count = count ( $map ) ; $j < $map_count ; $j ++ )
                      {
                          $subst = hexdec ( $map [$j] ) ;

                          // Check for this very special, not really documented feature which maps CIDs to a non-existing
                          // Unicode character (but it still corresponds to something...)
                          if ( isset ( PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ) )
                              $subst = PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ;

                          $this -> DirectMap [ $char + $j ] = $subst ;
                      }
                  }
              }
          }

          // Process beginbfrange/endbfrange constructs
          if ( preg_match_all ( '/ beginbfrange \s* (?P<ranges> .*?) endbfrange /imsx', $definitions, $range_matches ) )
          {
              foreach ( $range_matches [ 'ranges' ] as $range_list )
              {
                  $start_index = 0 ;

                  // There are two forms of syntax in a beginbfrange..endbfrange construct
                  // 1) "<x> <y> <z>", which maps character ids x through y to z through (z+y-x)
                  // 2) "<x> <y> [<a1> <a2> ... <an>]", which maps character x to a1, x+1 to a2, up to y, which is mapped to an
                  // All the values are hex digits.
                  // We will loop through the range definitions by first identifying the <x> and <y>, and the character that follows
                  // them, which is either a "<" for notation 1), or a "[" for notation 2).
                  while ( preg_match ( '# < \s* (?P<from> [0-9a-f]+) \s* > \s* < \s* (?P<to> [0-9a-f]+) \s* > \s* (?P<nextchar> .) #imsx',
                              $range_list, $range_match, PREG_OFFSET_CAPTURE, $start_index ) )
                  {
                      $from = hexdec ( $range_match [ 'from' ] [0] ) ;
                      $to = hexdec ( $range_match [ 'to' ] [0] ) ;
                      $next_char = $range_match [ 'nextchar' ] [0] ;
                      $next_char_index = $range_match [ 'nextchar' ] [1] ;
                      $char_width = strlen ( $range_match [ 'from' ] [0] ) ;

                      if ( $char_width > $max_found_char_width )
                          $max_found_char_width = $char_width ;

                      // Form 1) : catch the third hex value after <x> and <y>
                      if ( $next_char == '<' )
                      {
                          if ( preg_match ( '/ \s* (?P<start> [0-9a-f]+) (?P<tail> \s* > \s*) /imsx', $range_list, $start_match, PREG_OFFSET_CAPTURE, $next_char_index + 1 ) )
                          {
                              $subst = hexdec ( $start_match [ 'start' ] [0] ) ;

                              // Check for this very special, not really documented feature which maps CIDs to a non-existing
                              // Unicode character (but it still corresponds to something...)
                              if ( isset ( PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ) )
                                  $subst = PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ;

                              // Don't create a range if <x> and <y> are the same
                              if ( $from != $to )
                              {
                                  $this -> RangeMap [] = array ( $from, $to, $subst ) ;

                                  // Adjust min and max values for the ranges stored in this character map - to avoid unnecessary testing
                                  if ( $from < $this -> RangeMin )
                                      $this -> RangeMin = $from ;

                                  if ( $to > $this -> RangeMax )
                                      $this -> RangeMax = $to ;
                              }
                              else
                                  $this -> DirectMap [ $from ] = $subst ;

                              $start_index = $start_match [ 'tail' ] [1] + 1 ;
                          }
                          else
                          {
                              error ( "Character range $from..$to not followed by an hexadecimal value in Unicode map #$object_id." ) ;

                              // If error() returns, move past the offending character so that the loop cannot
                              // match the same range definition forever
                              $start_index = $next_char_index + 1 ;
                          }
                      }
                      // Form 2) : catch all the hex values between square brackets after <x> and <y>
                      else if ( $next_char == '[' )
                      {
                          if ( preg_match ( '/ (?P<values> [\s<>0-9a-f]+ ) (?P<tail> \] \s*)/imsx', $range_list, $array_match, PREG_OFFSET_CAPTURE, $next_char_index + 1 ) )
                          {
                              preg_match_all ( '/ < \s* (?P<num> [0-9a-f]+) \s* > /imsx', $array_match [ 'values' ] [0], $array_values ) ;

                              // Never read past the values really supplied : an array shorter than the declared
                              // <x>..<y> range would otherwise generate undefined-index accesses and bogus mappings
                              // to code point 0 (and, for a huge declared range, a very long loop)
                              $value_count = count ( $array_values [ 'num' ] ) ;

                              for ( $i = $from, $count = 0 ; $i <= $to && $count < $value_count ; $i ++, $count ++ )
                                  $this -> DirectMap [$i] = hexdec ( $array_values [ 'num' ] [ $count ] ) ;

                              $start_index = $array_match [ 'tail' ] [1] + 1 ;
                          }
                          else
                          {
                              error ( "Character range $from..$to not followed by an array of hexadecimal values in Unicode map #$object_id." ) ;

                              // Same safeguard as above if error() returns
                              $start_index = $next_char_index + 1 ;
                          }
                      }
                      else
                      {
                          error ( "Unexpected character '$next_char' in Unicode map #$object_id." ) ;
                          $start_index = $range_match [ 'nextchar' ] [1] + 1 ;
                      }
                  }
              }

              // Sort the ranges by their starting offsets, as required by the dichotomic search in offsetGetSafe()
              $this -> RangeCount = count ( $this -> RangeMap ) ;

              if ( $this -> RangeCount > 1 )
              {
                  usort ( $this -> RangeMap, array ( $this, '__rangemap_cmpfunc' ) ) ;
              }
          }

          // The widths actually met in the definitions take precedence over the declared codespace range
          if ( $max_found_char_width && $max_found_char_width != $this -> HexCharWidth )
          {
              if ( PdfToText::$DEBUG )
                  warning ( "Character map #$object_id : specified code width ({$this -> HexCharWidth}) differs from actual width ($max_found_char_width)." ) ;

              $this -> HexCharWidth = $max_found_char_width ;
          }
      }


      // __rangemap_cmpfunc -
      //     usort() callback ordering $RangeMap entries by their starting code.
      //     Returns an explicit -1/0/1 : hexdec() yields a float for very large values, and a float difference
      //     is not a reliable comparison result.
      public function __rangemap_cmpfunc ( $a, $b )
      {
          if ( $a [0] == $b [0] )
              return ( 0 ) ;

          return ( ( $a [0] < $b [0] ) ? -1 : 1 ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          Interface implementations.

       *-------------------------------------------------------------------------------------------------------------*/

      // Number of direct mappings currently known (range entries are only counted once they have been looked up)
      public function count ( )
         { return ( count ( $this -> DirectMap ) ) ; }


      // True when $offset has a mapping, either direct or through a range
      public function offsetExists ( $offset )
         { return ( $this -> offsetGetSafe ( $offset ) !== false ) ; }


      // offsetGetSafe -
      //     Returns the substitution for character code $offset, or false when this map defines none.
      //     When $translate is true the result goes through CodePointToUtf8() ; otherwise the raw code point
      //     is returned.
      public function offsetGetSafe ( $offset, $translate = true )
      {
          // Return value
          $code = false ;

          // Character already has an entry (character reference => substituted character)
          if ( isset ( $this -> DirectMap [ $offset ] ) )
          {
              $code = ( $translate ) ? $this -> CodePointToUtf8 ( $this -> DirectMap [ $offset ] ) : $this -> DirectMap [ $offset ] ;
          }
          // Character does not have a direct entry ; have a look in the character ranges defined for this map
          else if ( $this -> RangeCount && $offset >= $this -> RangeMin && $offset <= $this -> RangeMax )
          {
              $low = 0 ;
              $high = count ( $this -> RangeMap ) - 1 ;
              $result = false ;

              // Use a dichotomic search through character ranges
              while ( $low <= $high )
              {
                  $middle = ( $low + $high ) >> 1 ;

                  if ( $offset < $this -> RangeMap [ $middle ] [0] )
                      $high = $middle - 1 ;
                  else if ( $offset > $this -> RangeMap [ $middle ] [1] )
                      $low = $middle + 1 ;
                  else
                  {
                      // Found : substitution start + distance of $offset from the range start
                      $result = $this -> RangeMap [ $middle ] [2] + $offset - $this -> RangeMap [ $middle ] [0] ;
                      break ;
                  }
              }

              // Once a character has been found in the ranges defined by this character map, store it in the DirectMap property
              // so that it will be directly retrieved during subsequent accesses
              if ( $result !== false )
              {
                  $code = ( $translate ) ? $this -> CodePointToUtf8 ( $result ) : $result ;
                  $this -> DirectMap [ $offset ] = $result ;
              }
          }

          // All done, return
          return ( $code ) ;
      }


      // offsetGet -
      //     Same as offsetGetSafe(), but falls back to converting $offset itself when no mapping exists.
      public function offsetGet ( $offset )
      {
          $code = $this -> offsetGetSafe ( $offset ) ;

          if ( $code === false )
              $code = $this -> CodePointToUtf8 ( $offset ) ;

          return ( $code ) ;
      }
  }
  7479.  
  7480.  
  7481. /*==============================================================================================================
  7482.  
  7483. PdfTexterEncodingMap -
  7484. A class for fonts having a character map specified with the /Encoding parameter.
  7485.  
  7486. ==============================================================================================================*/
  class PdfTexterEncodingMap extends PdfTexterCharacterMap
  {
      // Possible encodings (there is a 5th one, MacExpertEncoding, but used for "expert fonts" ; no need to deal
      // with it here since we only want to extract text)
      // Note that the values of these constants are direct indices to the second dimension of the $Encodings table
      const PDF_STANDARD_ENCODING = 0 ;
      const PDF_MAC_ROMAN_ENCODING = 1 ;
      const PDF_WIN_ANSI_ENCODING = 2 ;
      const PDF_DOC_ENCODING = 3 ;

      // Correspondence between an encoding name and its corresponding character in the
      // following format : Standard, Mac, Windows, Pdf
      // $GlobalEncodings is loaded once from Maps/adobe-charsets.map and shared by all instances
      private static $GlobalEncodings = false ;
      public $Encodings ;
      // Encoding type (one of the PDF_*_ENCODING constants)
      public $Encoding ;
      // Indicates whether this character map is a secondary one used for Unicode maps ; this must be set at
      // a higher level by the PdfTexterFont because at the time a character map is instantiated, we do not know
      // yet whether it will be a primary (normal) map, or a map secondary to an existing Unicode map
      public $Secondary ;
      // Differences array (a character substitution table to the standard encodings)
      public $Map = array ( ) ;
      // A secondary map for the Differences array, which only contains the differences ; this is used
      // for Unicode fonts that also have an associated /Differences parameter, which should not include the
      // whole standard Adobe character map but only the differences of encodings
      public $SecondaryMap = array ( ) ;
      // Differences by position number (character index => glyph name as found in the /Differences array)
      public $DifferencesByPosition = array ( ) ;


      // Constructor -
      //     Determines the base encoding named in $definitions (WinAnsiEncoding when none is found), builds
      //     the initial character map from the Adobe character set table merged with $extra_mappings, then
      //     applies the first bracketed array found in $definitions as a /Differences array.
      public function __construct ( $object_id, $definitions, $extra_mappings )
      {
          // Ignore character variants whose names end with these suffixes
          static $IgnoredVariants = array
             (
              '/\.scalt$/',
              '/\.sc$/',
              '/\.fitted$/',
              '/\.oldstyle$/',
              '/\.taboldstyle$/',
              '/\.alt$/',
              '/alt$/',
             ) ;

          parent::__construct ( $object_id ) ;

          // Load the default Adobe character sets, if not already done
          if ( self::$GlobalEncodings === false )
          {
              $charset_file = dirname ( __FILE__ ) . '/Maps/adobe-charsets.map' ;
              include ( $charset_file ) ;
              self::$GlobalEncodings = ( isset ( $adobe_charsets ) ) ? $adobe_charsets : array ( ) ;
          }

          $this -> Encodings = array_merge ( self::$GlobalEncodings, $extra_mappings ) ;

          // Fonts using default Adobe character sets and hexadecimal representations are one-byte long
          $this -> HexCharWidth = 2 ;

          if ( PdfToText::$DEBUG )
          {
              echo "\n----------------------------------- ENCODING CMAP #$object_id\n" ;
              echo $definitions;
          }

          // Retrieve text encoding
          preg_match ( '# / (?P<encoding> (WinAnsiEncoding) | (PDFDocEncoding) | (MacRomanEncoding) | (StandardEncoding) ) #ix',
                  $definitions, $encoding_match ) ;

          if ( ! isset ( $encoding_match [ 'encoding' ] ) )
              $encoding_match [ 'encoding' ] = 'WinAnsiEncoding' ;

          switch ( strtolower ( $encoding_match [ 'encoding' ] ) )
          {
              case 'pdfdocencoding' : $this -> Encoding = self::PDF_DOC_ENCODING ; break ;
              case 'macromanencoding' : $this -> Encoding = self::PDF_MAC_ROMAN_ENCODING ; break ;
              case 'standardencoding' : $this -> Encoding = self::PDF_STANDARD_ENCODING ; break ;
              case 'winansiencoding' :
              default : $this -> Encoding = self::PDF_WIN_ANSI_ENCODING ;
          }

          // Build a virgin character map using the detected encoding
          foreach ( $this -> Encodings as $code_array )
          {
              $char = $code_array [ $this -> Encoding ] ;
              $this -> Map [ $char ] = $char ;
          }

          // Extract the Differences array
          preg_match ( '/ \[ \s* (?P<contents> [^\]]*?) \s* \] /x', $definitions, $match ) ;

          if ( ! isset ( $match [ 'contents' ] ) )
              return ;

          // Put a slash in front of every number preceded by whitespace, so that splitting on "/" yields
          // one item per number or character name
          $data = trim ( preg_replace ( '/\s+(\d+)/', '/$1', $match [ 'contents' ] ) ) ;
          $items = explode ( '/', $data ) ;
          $index = 0 ;

          for ( $i = 0, $item_count = count ( $items ) ; $i < $item_count ; $i ++ )
          {
              $raw_item = trim ( $items [$i] ) ;

              // Splitting on "/" produces empty items (leading slash, empty array, consecutive slashes) ;
              // they are not character names and must neither create a "?" mapping nor shift the index
              if ( $raw_item === '' )
                  continue ;

              $item = PdfToText::DecodeRawName ( $raw_item ) ;

              // Integer value : index of next character in map
              if ( is_numeric ( $item ) )
                  $index = ( int ) $item ;
              // String value : a character name, as defined by Adobe
              else
              {
                  // Remove variant part of the character name
                  $item = preg_replace ( $IgnoredVariants, '', trim ( $item ) ) ;

                  // Keyword (character name) exists in the encoding table
                  if ( isset ( $this -> Encodings [ $item ] ) )
                  {
                      $this -> Map [ $index ] =
                      $this -> SecondaryMap [ $index ] = $this -> Encodings [ $item ] [ $this -> Encoding ] ;
                  }
                  // Not defined ; check if this is the "/gxx" notation, where "xx" is a number
                  else if ( preg_match ( '/g (?P<value> \d+)/x', $item, $match ) )
                  {
                      $value = ( int ) $match [ 'value' ] ;

                      // In my current state of investigations, the /g notation has the following characteristics :
                      // - The value 29 must be added to the number after the "/g" string (why ???)
                      // - The value after the "/g" string can be greater than 255, meaning that it could be Unicode codepoint
                      // This has to be carefully watched before revision
                      $value += 29 ;

                      $this -> Map [ $index ] =
                      $this -> SecondaryMap [ $index ] = $value ;
                  }
                  // Some characters can be specified by the "/uni" prefix followed by a sequence of hex digits,
                  // which is not described by the PDF specifications. This sequence gives a Unicode code point.
                  else if ( preg_match ( '/uni (?P<value> [0-9a-f]+)/ix', $item, $match ) )
                  {
                      $value = hexdec ( $match [ 'value' ] ) ;

                      $this -> Map [ $index ] =
                      $this -> SecondaryMap [ $index ] = ( int ) $value ;
                  }
                  // Otherwise, put a question mark instead
                  else
                  {
                      if ( PdfToText::$DEBUG )
                          warning ( "Unknown character name found in a /Differences[] array : [$item]" ) ;

                      $this -> Map [ $index ] =
                      $this -> SecondaryMap [ $index ] = ord ( '?' ) ;
                  }

                  $this -> DifferencesByPosition [ $index ] = $item ;

                  $index ++ ;
              }
          }
      }


      /*--------------------------------------------------------------------------------------------------------------

          Interface implementations.

       *-------------------------------------------------------------------------------------------------------------*/

      // Number of entries in the full character map
      public function count ( )
         { return ( count ( $this -> Map ) ) ; }


      // Looks into $Map for a primary map, or into $SecondaryMap when the map has been flagged as secondary
      public function offsetExists ( $offset )
      {
          return ( ( ! $this -> Secondary ) ?
                  isset ( $this -> Map [ $offset ] ) :
                  isset ( $this -> SecondaryMap [ $offset ] ) ) ;
      }


      // offsetGet -
      //     Returns the translation of character code $offset, converted by CodePointToUtf8().
      //     A primary map always returns something (unknown codes are converted as is) ; a secondary map
      //     returns false for codes that are not listed in the /Differences array.
      public function offsetGet ( $offset )
      {
          if ( ! $this -> Secondary )
          {
              if ( isset ( $this -> Map [ $offset ] ) )
                  $ord = $this -> Map [ $offset ] ;
              else
                  $ord = $offset ;

              // Check for final character translations (concerns only a few number of characters)
              if ( $this -> Encoding == self::PDF_WIN_ANSI_ENCODING && isset ( PdfTexterAdobeWinAnsiMap::$WinAnsiCharacterMap [0] [ $ord ] ) )
                  $ord = PdfTexterAdobeWinAnsiMap::$WinAnsiCharacterMap [0] [ $ord ] ;
              else if ( $this -> Encoding == self::PDF_MAC_ROMAN_ENCODING && isset ( PdfTexterAdobeMacRomanMap::$MacRomanCharacterMap [0] [ $ord ] ) )
                  $ord = PdfTexterAdobeMacRomanMap::$MacRomanCharacterMap [0] [ $ord ] ;
              // As far as I have been able to see, the values expressed by the /Differences tag were the only ones used within the
              // Pdf document ; however, handle the case where some characters do not belong to the characters listed by /Differences,
              // and use the official Adobe encoding maps when necessary
              else if ( isset ( $this -> Encodings [ $ord ] [ $this -> Encoding ] ) )
                  $ord = $this -> Encodings [ $ord ] [ $this -> Encoding ] ;

              $result = $this -> CodePointToUtf8 ( $ord ) ;
          }
          else if ( isset ( $this -> SecondaryMap [ $offset ] ) )
          {
              $ord = $this -> SecondaryMap [ $offset ] ;
              $result = $this -> CodePointToUtf8 ( $ord ) ;
          }
          else
              $result = false ;

          return ( $result ) ;
      }
  }
  7698.  
  7699.  
  7700. /**************************************************************************************************************
  7701. **************************************************************************************************************
  7702. **************************************************************************************************************
  7703. ****** ******
  7704. ****** ******
  7705. ****** CHARACTER MAP MANAGEMENT ******
  7706. ****** ******
  7707. ****** ******
  7708. **************************************************************************************************************
  7709. **************************************************************************************************************
  7710. **************************************************************************************************************/
  7711.  
  7712. /*==============================================================================================================
  7713.  
  7714. class PdfTexterAdobeMap -
  7715. Abstract class to handle Adobe-specific fonts.
  7716.  
  7717. ==============================================================================================================*/
  abstract class PdfTexterAdobeMap extends PdfTexterCharacterMap
  {
      // Font variant ; one of the PdfTexterFont::FONT_VARIANT_* constants
      public $Variant ;
      // Translation tables supplied by derived classes, indexed by font variant then by character code
      public $Map ;


      // Constructor -
      //     Stores the font variant and the translation tables supplied by the derived class, and reports
      //     an error through error() when $map has no table for the requested variant.
      public function __construct ( $object_id, $font_variant, $map )
      {
          parent::__construct ( $object_id ) ;

          // Character codes handled by these maps are written with two hex digits
          $this -> HexCharWidth = 2 ;
          $this -> Variant = $font_variant ;
          $this -> Map = $map ;

          if ( ! isset ( $map [ $font_variant ] ) )
              error ( new PdfToTextDecodingException ( "Undefined font variant #$font_variant." ) ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          Interface implementations.

       *-------------------------------------------------------------------------------------------------------------*/

      // Number of substitutions defined for the current font variant.
      // (the property is $this -> Map ; the former "$this -> $Map" was a variable-property access on an
      // undefined local variable)
      public function count ( )
         { return ( count ( $this -> Map [ $this -> Variant ] ) ) ; }


      // True when the table of the current variant has a substitution for $offset
      public function offsetExists ( $offset )
         { return ( isset ( $this -> Map [ $this -> Variant ] [ $offset ] ) ) ; }


      // Returns the substitution defined for $offset - or $offset itself when none is defined - converted
      // by CodePointToUtf8()
      public function offsetGet ( $offset )
      {
          if ( isset ( $this -> Map [ $this -> Variant ] [ $offset ] ) )
              $ord = $this -> Map [ $this -> Variant ] [ $offset ] ;
          else
              $ord = $offset ;

          return ( $this -> CodePointToUtf8 ( $ord ) ) ;
      }
  }
  7762.  
  7763.  
  7764. /*==============================================================================================================
  7765.  
  7766. class PdfTexterAdobeWinAnsiMap -
  7767. Character map for Adobe-specific Win Ansi fonts (concrete class derived from PdfTexterAdobeMap).
  7768.  
  7769. ==============================================================================================================*/
  class PdfTexterAdobeWinAnsiMap extends PdfTexterAdobeMap
  {
      // Translation tables from Windows Ansi codes to Unicode code points, indexed by font variant.
      // Only the codes whose Unicode value differs from the code itself are listed.
      // Source : https://msdn.microsoft.com/en-us/goglobal/cc305145.aspx
      public static $WinAnsiCharacterMap = array
         (
          // Variant 0 - normal WinAnsi mapping ; only codes 0x80 to 0x9F have no direct translation
          0 => array
             (
              0x80 => 0x20AC,     // EURO SIGN
              0x82 => 0x201A,     // SINGLE LOW-9 QUOTATION MARK
              0x83 => 0x0192,     // LATIN SMALL LETTER F WITH HOOK
              0x84 => 0x201E,     // DOUBLE LOW-9 QUOTATION MARK
              0x85 => 0x2026,     // HORIZONTAL ELLIPSIS
              0x86 => 0x2020,     // DAGGER
              0x87 => 0x2021,     // DOUBLE DAGGER
              0x88 => 0x02C6,     // MODIFIER LETTER CIRCUMFLEX ACCENT
              0x89 => 0x2030,     // PER MILLE SIGN
              0x8A => 0x0160,     // LATIN CAPITAL LETTER S WITH CARON
              0x8B => 0x2039,     // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
              0x8C => 0x0152,     // LATIN CAPITAL LIGATURE OE
              0x8E => 0x017D,     // LATIN CAPITAL LETTER Z WITH CARON
              0x91 => 0x2018,     // LEFT SINGLE QUOTATION MARK
              0x92 => 0x2019,     // RIGHT SINGLE QUOTATION MARK
              0x93 => 0x201C,     // LEFT DOUBLE QUOTATION MARK
              0x94 => 0x201D,     // RIGHT DOUBLE QUOTATION MARK
              0x95 => 0x2022,     // BULLET
              0x96 => 0x2013,     // EN DASH
              0x97 => 0x2014,     // EM DASH
              0x98 => 0x02DC,     // SMALL TILDE
              0x99 => 0x2122,     // TRADE MARK SIGN
              0x9A => 0x0161,     // LATIN SMALL LETTER S WITH CARON
              0x9B => 0x203A,     // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
              0x9C => 0x0153,     // LATIN SMALL LIGATURE OE
              0x9E => 0x017E,     // LATIN SMALL LETTER Z WITH CARON
              0x9F => 0x0178,     // LATIN CAPITAL LETTER Y WITH DIAERESIS
             ),
          // Variant 1 - Cyrillic. Codes 0xC0..0xFF are mapped onto U+0410..U+044F (the basic Russian alphabet),
          // which is the Windows-1251 layout rather than the ISO 8859-5 one
          1 => array
             (
              // Both double quotation marks become a plain ASCII quote
              0x93 => 0x0022, 0x94 => 0x0022,
              // Capital letters, U+0410..U+042F
              0xC0 => 0x0410, 0xC1 => 0x0411, 0xC2 => 0x0412, 0xC3 => 0x0413,
              0xC4 => 0x0414, 0xC5 => 0x0415, 0xC6 => 0x0416, 0xC7 => 0x0417,
              0xC8 => 0x0418, 0xC9 => 0x0419, 0xCA => 0x041A, 0xCB => 0x041B,
              0xCC => 0x041C, 0xCD => 0x041D, 0xCE => 0x041E, 0xCF => 0x041F,
              0xD0 => 0x0420, 0xD1 => 0x0421, 0xD2 => 0x0422, 0xD3 => 0x0423,
              0xD4 => 0x0424, 0xD5 => 0x0425, 0xD6 => 0x0426, 0xD7 => 0x0427,
              0xD8 => 0x0428, 0xD9 => 0x0429, 0xDA => 0x042A, 0xDB => 0x042B,
              0xDC => 0x042C, 0xDD => 0x042D, 0xDE => 0x042E, 0xDF => 0x042F,
              // Small letters, U+0430..U+044F
              0xE0 => 0x0430, 0xE1 => 0x0431, 0xE2 => 0x0432, 0xE3 => 0x0433,
              0xE4 => 0x0434, 0xE5 => 0x0435, 0xE6 => 0x0436, 0xE7 => 0x0437,
              0xE8 => 0x0438, 0xE9 => 0x0439, 0xEA => 0x043A, 0xEB => 0x043B,
              0xEC => 0x043C, 0xED => 0x043D, 0xEE => 0x043E, 0xEF => 0x043F,
              0xF0 => 0x0440, 0xF1 => 0x0441, 0xF2 => 0x0442, 0xF3 => 0x0443,
              0xF4 => 0x0444, 0xF5 => 0x0445, 0xF6 => 0x0446, 0xF7 => 0x0447,
              0xF8 => 0x0448, 0xF9 => 0x0449, 0xFA => 0x044A, 0xFB => 0x044B,
              0xFC => 0x044C, 0xFD => 0x044D, 0xFE => 0x044E, 0xFF => 0x044F,
             ),
         ) ;


      // Constructor -
      //     Hands the WinAnsi translation tables over to the parent class, together with the object id
      //     and the requested font variant.
      public function __construct ( $object_id, $font_variant )
      {
          $translation_tables = self::$WinAnsiCharacterMap ;

          parent::__construct ( $object_id, $font_variant, $translation_tables ) ;
      }
  }
  7885.  
  7886.  
  7887. /*==============================================================================================================
  7888.  
  7889. class PdfTexterAdobeMacRomanMap -
  7890. Character map for Adobe-specific Mac Roman fonts (concrete class derived from PdfTexterAdobeMap).
  7891.  
  7892. ==============================================================================================================*/
  7893. class PdfTexterAdobeMacRomanMap extends PdfTexterAdobeMap
  7894. {
  7895. // Mac roman to Unicode encoding
  7896. // Source : ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
  7897. public static $MacRomanCharacterMap = array
  7898. (
  7899. 0 => array
  7900. (
  7901. 0x80 => 0x00C4, # LATIN CAPITAL LETTER A WITH DIAERESIS
  7902. 0x81 => 0x00C5, # LATIN CAPITAL LETTER A WITH RING ABOVE
  7903. 0x82 => 0x00C7, # LATIN CAPITAL LETTER C WITH CEDILLA
  7904. 0x83 => 0x00C9, # LATIN CAPITAL LETTER E WITH ACUTE
  7905. 0x84 => 0x00D1, # LATIN CAPITAL LETTER N WITH TILDE
  7906. 0x85 => 0x00D6, # LATIN CAPITAL LETTER O WITH DIAERESIS
  7907. 0x86 => 0x00DC, # LATIN CAPITAL LETTER U WITH DIAERESIS
  7908. 0x87 => 0x00E1, # LATIN SMALL LETTER A WITH ACUTE
  7909. 0x88 => 0x00E0, # LATIN SMALL LETTER A WITH GRAVE
  7910. 0x89 => 0x00E2, # LATIN SMALL LETTER A WITH CIRCUMFLEX
  7911. 0x8A => 0x00E4, # LATIN SMALL LETTER A WITH DIAERESIS
  7912. 0x8B => 0x00E3, # LATIN SMALL LETTER A WITH TILDE
  7913. 0x8C => 0x00E5, # LATIN SMALL LETTER A WITH RING ABOVE
  7914. 0x8D => 0x00E7, # LATIN SMALL LETTER C WITH CEDILLA
  7915. 0x8E => 0x00E9, # LATIN SMALL LETTER E WITH ACUTE
  7916. 0x8F => 0x00E8, # LATIN SMALL LETTER E WITH GRAVE
  7917. 0x90 => 0x00EA, # LATIN SMALL LETTER E WITH CIRCUMFLEX
  7918. 0x91 => 0x00EB, # LATIN SMALL LETTER E WITH DIAERESIS
  7919. 0x92 => 0x00ED, # LATIN SMALL LETTER I WITH ACUTE
  7920. 0x93 => 0x00EC, # LATIN SMALL LETTER I WITH GRAVE
  7921. 0x94 => 0x00EE, # LATIN SMALL LETTER I WITH CIRCUMFLEX
  7922. 0x95 => 0x00EF, # LATIN SMALL LETTER I WITH DIAERESIS
  7923. 0x96 => 0x00F1, # LATIN SMALL LETTER N WITH TILDE
  7924. 0x97 => 0x00F3, # LATIN SMALL LETTER O WITH ACUTE
  7925. 0x98 => 0x00F2, # LATIN SMALL LETTER O WITH GRAVE
  7926. 0x99 => 0x00F4, # LATIN SMALL LETTER O WITH CIRCUMFLEX
  7927. 0x9A => 0x00F6, # LATIN SMALL LETTER O WITH DIAERESIS
  7928. 0x9B => 0x00F5, # LATIN SMALL LETTER O WITH TILDE
  7929. 0x9C => 0x00FA, # LATIN SMALL LETTER U WITH ACUTE
  7930. 0x9D => 0x00F9, # LATIN SMALL LETTER U WITH GRAVE
  7931. 0x9E => 0x00FB, # LATIN SMALL LETTER U WITH CIRCUMFLEX
  7932. 0x9F => 0x00FC, # LATIN SMALL LETTER U WITH DIAERESIS
  7933. 0xA0 => 0x2020, # DAGGER
  7934. 0xA1 => 0x00B0, # DEGREE SIGN
  7935. 0xA2 => 0x00A2, # CENT SIGN
  7936. 0xA3 => 0x00A3, # POUND SIGN
  7937. 0xA4 => 0x00A7, # SECTION SIGN
  7938. 0xA5 => 0x2022, # BULLET
  7939. 0xA6 => 0x00B6, # PILCROW SIGN
  7940. 0xA7 => 0x00DF, # LATIN SMALL LETTER SHARP S
  7941. 0xA8 => 0x00AE, # REGISTERED SIGN
  7942. 0xA9 => 0x00A9, # COPYRIGHT SIGN
  7943. 0xAA => 0x2122, # TRADE MARK SIGN
  7944. 0xAB => 0x00B4, # ACUTE ACCENT
  7945. 0xAC => 0x00A8, # DIAERESIS
  7946. 0xAD => 0x2260, # NOT EQUAL TO
  7947. 0xAE => 0x00C6, # LATIN CAPITAL LETTER AE
  7948. 0xAF => 0x00D8, # LATIN CAPITAL LETTER O WITH STROKE
  7949. 0xB0 => 0x221E, # INFINITY
  7950. 0xB1 => 0x00B1, # PLUS-MINUS SIGN
  7951. 0xB2 => 0x2264, # LESS-THAN OR EQUAL TO
  7952. 0xB3 => 0x2265, # GREATER-THAN OR EQUAL TO
  7953. 0xB4 => 0x00A5, # YEN SIGN
  7954. 0xB5 => 0x00B5, # MICRO SIGN
  7955. 0xB6 => 0x2202, # PARTIAL DIFFERENTIAL
  7956. 0xB7 => 0x2211, # N-ARY SUMMATION
  7957. 0xB8 => 0x220F, # N-ARY PRODUCT
  7958. 0xB9 => 0x03C0, # GREEK SMALL LETTER PI
  7959. 0xBA => 0x222B, # INTEGRAL
  7960. 0xBB => 0x00AA, # FEMININE ORDINAL INDICATOR
  7961. 0xBC => 0x00BA, # MASCULINE ORDINAL INDICATOR
  7962. 0xBD => 0x03A9, # GREEK CAPITAL LETTER OMEGA
  7963. 0xBE => 0x00E6, # LATIN SMALL LETTER AE
  7964. 0xBF => 0x00F8, # LATIN SMALL LETTER O WITH STROKE
  7965. 0xC0 => 0x00BF, # INVERTED QUESTION MARK
  7966. 0xC1 => 0x00A1, # INVERTED EXCLAMATION MARK
  7967. 0xC2 => 0x00AC, # NOT SIGN
  7968. 0xC3 => 0x221A, # SQUARE ROOT
  7969. 0xC4 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
  7970. 0xC5 => 0x2248, # ALMOST EQUAL TO
  7971. 0xC6 => 0x2206, # INCREMENT
  7972. 0xC7 => 0x00AB, # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  7973. 0xC8 => 0x00BB, # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
  7974. 0xC9 => 0x2026, # HORIZONTAL ELLIPSIS
  7975. 0xCA => 0x00A0, # NO-BREAK SPACE
  7976. 0xCB => 0x00C0, # LATIN CAPITAL LETTER A WITH GRAVE
  7977. 0xCC => 0x00C3, # LATIN CAPITAL LETTER A WITH TILDE
  7978. 0xCD => 0x00D5, # LATIN CAPITAL LETTER O WITH TILDE
  7979. 0xCE => 0x0152, # LATIN CAPITAL LIGATURE OE
  7980. 0xCF => 0x0153, # LATIN SMALL LIGATURE OE
  7981. 0xD0 => 0x2013, # EN DASH
  7982. 0xD1 => 0x2014, # EM DASH
  7983. 0xD2 => 0x201C, # LEFT DOUBLE QUOTATION MARK
  7984. 0xD3 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
  7985. 0xD4 => 0x2018, # LEFT SINGLE QUOTATION MARK
  7986. 0xD5 => 0x2019, # RIGHT SINGLE QUOTATION MARK
  7987. 0xD6 => 0x00F7, # DIVISION SIGN
  7988. 0xD7 => 0x25CA, # LOZENGE
  7989. 0xD8 => 0x00FF, # LATIN SMALL LETTER Y WITH DIAERESIS
  7990. 0xD9 => 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
  7991. 0xDA => 0x2044, # FRACTION SLASH
  7992. 0xDB => 0x20AC, # EURO SIGN
  7993. 0xDC => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  7994. 0xDD => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  7995. 0xDE => 0xFB01, # LATIN SMALL LIGATURE FI
  7996. 0xDF => 0xFB02, # LATIN SMALL LIGATURE FL
  7997. 0xE0 => 0x2021, # DOUBLE DAGGER
  7998. 0xE1 => 0x00B7, # MIDDLE DOT
  7999. 0xE2 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
  8000. 0xE3 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
  8001. 0xE4 => 0x2030, # PER MILLE SIGN
  8002. 0xE5 => 0x00C2, # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
  8003. 0xE6 => 0x00CA, # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
  8004. 0xE7 => 0x00C1, # LATIN CAPITAL LETTER A WITH ACUTE
  8005. 0xE8 => 0x00CB, # LATIN CAPITAL LETTER E WITH DIAERESIS
  8006. 0xE9 => 0x00C8, # LATIN CAPITAL LETTER E WITH GRAVE
  8007. 0xEA => 0x00CD, # LATIN CAPITAL LETTER I WITH ACUTE
  8008. 0xEB => 0x00CE, # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
  8009. 0xEC => 0x00CF, # LATIN CAPITAL LETTER I WITH DIAERESIS
  8010. 0xED => 0x00CC, # LATIN CAPITAL LETTER I WITH GRAVE
  8011. 0xEE => 0x00D3, # LATIN CAPITAL LETTER O WITH ACUTE
  8012. 0xEF => 0x00D4, # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
  8013. 0xF0 => 0xF8FF, # Apple logo
  8014. 0xF1 => 0x00D2, # LATIN CAPITAL LETTER O WITH GRAVE
  8015. 0xF2 => 0x00DA, # LATIN CAPITAL LETTER U WITH ACUTE
  8016. 0xF3 => 0x00DB, # LATIN CAPITAL LETTER U WITH CIRCUMFLEX
  8017. 0xF4 => 0x00D9, # LATIN CAPITAL LETTER U WITH GRAVE
  8018. 0xF5 => 0x0131, # LATIN SMALL LETTER DOTLESS I
  8019. 0xF6 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
  8020. 0xF7 => 0x02DC, # SMALL TILDE
  8021. 0xF8 => 0x00AF, # MACRON
  8022. 0xF9 => 0x02D8, # BREVE
  8023. 0xFA => 0x02D9, # DOT ABOVE
  8024. 0xFB => 0x02DA, # RING ABOVE
  8025. 0xFC => 0x00B8, # CEDILLA
  8026. 0xFD => 0x02DD, # DOUBLE ACUTE ACCENT
  8027. 0xFE => 0x02DB, # OGONEK
  8028. 0xFF => 0x02C7 # CARON
  8029. )
  8030. ) ;
  8031.  
  8032.  
  // Builds the character map for the given PDF object and font variant by handing this class's static
  // $MacRomanCharacterMap table to the parent constructor.
  public function __construct($object_id, $font_variant)
  {
      parent::__construct(
          $object_id,
          $font_variant,
          self::$MacRomanCharacterMap
      );
  }
  8037. }
  8038.  
  8039.  
  8040. /*==============================================================================================================
  8041.  
  8042. class PdfTexterAdobeUndocumentedUnicodeMap -
  8043. Sometimes, Unicode maps translate character ids to something in the range 0xF000..0xF0FF (or maybe more).
  8044. These mapped characters do not correspond to anything else in Unicode, but rather to a special character
  8045. set.
  8046. This class is not meant to be instantiated by anything here, but rather used for its $UnicodeMap property.
  8047. Note that the $UnicodeMap array is not complete.
  8048.  
  8049. ==============================================================================================================*/
  class PdfTexterAdobeUndocumentedUnicodeMap extends PdfTexterAdobeMap
  {
      // Translation table : keys are code points in the 0xF0xx range, values are the code of the character
      // each key stands for.
      // For every entry except the space and the two accented letters, key + value equals 0xF120.
      public static $UnicodeMap = array(
          // '0' through '9'
          0xF0F0 => 0x30, 0xF0EF => 0x31, 0xF0EE => 0x32, 0xF0ED => 0x33, 0xF0EC => 0x34,
          0xF0EB => 0x35, 0xF0EA => 0x36, 0xF0E9 => 0x37, 0xF0E8 => 0x38, 0xF0E7 => 0x39,

          // 'A' through 'Z'
          0xF0DF => 0x41, 0xF0DE => 0x42, 0xF0DD => 0x43, 0xF0DC => 0x44, 0xF0DB => 0x45,
          0xF0DA => 0x46, 0xF0D9 => 0x47, 0xF0D8 => 0x48, 0xF0D7 => 0x49, 0xF0D6 => 0x4A,
          0xF0D5 => 0x4B, 0xF0D4 => 0x4C, 0xF0D3 => 0x4D, 0xF0D2 => 0x4E, 0xF0D1 => 0x4F,
          0xF0D0 => 0x50, 0xF0CF => 0x51, 0xF0CE => 0x52, 0xF0CD => 0x53, 0xF0CC => 0x54,
          0xF0CB => 0x55, 0xF0CA => 0x56, 0xF0C9 => 0x57, 0xF0C8 => 0x58, 0xF0C7 => 0x59,
          0xF0C6 => 0x5A,

          // 'a' through 'z'
          0xF0BF => 0x61, 0xF0BE => 0x62, 0xF0BD => 0x63, 0xF0BC => 0x64, 0xF0BB => 0x65,
          0xF0BA => 0x66, 0xF0B9 => 0x67, 0xF0B8 => 0x68, 0xF0B7 => 0x69, 0xF0B6 => 0x6A,
          0xF0B5 => 0x6B, 0xF0B4 => 0x6C, 0xF0B3 => 0x6D, 0xF0B2 => 0x6E, 0xF0B1 => 0x6F,
          0xF0B0 => 0x70, 0xF0AF => 0x71, 0xF0AE => 0x72, 0xF0AD => 0x73, 0xF0AC => 0x74,
          0xF0AB => 0x75, 0xF0AA => 0x76, 0xF0A9 => 0x77, 0xF0A8 => 0x78, 0xF0A7 => 0x79,
          0xF0A6 => 0x7A,

          // Punctuation, space and accented letters
          0xF0F1 => 0x2F,     // '/'
          0xF0E6 => 0x3A,     // ':'
          0xF0F3 => 0x2D,     // '-'
          0xF0F8 => 0x28,     // '('
          0xF0F7 => 0x29,     // ')'
          0xF0F2 => 0x2E,     // '.'
          0xF020 => 0x20,     // Space
          0xF0F9 => 0x27,     // "'"
          0xF037 => 0xE9,     // &eacute;
          0xF038 => 0xE8,     // &egrave;
      );


      // Hands the static $UnicodeMap table, together with the object id and font variant, to the parent
      // constructor.
      public function __construct($object_id, $font_variant)
      {
          parent::__construct(
              $object_id,
              $font_variant,
              self::$UnicodeMap
          );
      }
  }
  8135.  
  8136.  
  8137. /*==============================================================================================================
  8138.  
  8139. PdfTexterCIDMap -
  8140. A class for mapping (or trying to...) CID fonts.
  8141.  
  8142. ==============================================================================================================*/
  abstract class PdfTexterCIDMap extends PdfTexterCharacterMap
  {
      // CID maps are associative arrays whose keys are the font CIDs (currently expressed as numeric values) and
      // whose values are the corresponding UTF8 representations. The following special values can also be used
      // to initialize certain entries :
      // UNKNOWN_CID :
      //    Indicates that the corresponding CID has no known UTF8 counterpart. When PdfToText::$DEBUG is true,
      //    such a character is replaced with the string "[UID:abcd]", where "abcd" is the hex representation of
      //    the CID, so that new CID tables can be built using this information. Otherwise it is replaced with
      //    an empty string.
      const UNKNOWN_CID = -1;
      // ALT_CID :
      //    Deliberately left undocumented by the original author and highly subject to change : it dates from a
      //    first interpretation of CID fonts, which is probably wrong. See offsetGet() for what it does.
      const ALT_CID = -2;


      // CID font map file ; the file is a PHP script that must contain an array of the form :
      //    $map = array
      //       (
      //        'plain' => array
      //           (
      //            $cid1 => $utf1,
      //            ...
      //           )
      //       ) ;
      protected $MapFile;
      // Map, loaded into memory
      protected $Map;
      // Map cache, indexed by "map name:font variant" - the interest is to avoid unnecessary includes
      private static $CachedMaps = array();

      // Related to the first experimental implementation of CID fonts : offset of the last ALT_CID entry
      // met by offsetGet(), or false if the previous lookup was not an ALT_CID one
      private $LastAltOffset = false;


      /*--------------------------------------------------------------------------------------------------------------

          Constructor -
              Loads the CID map selected by __get_cid_file() for the specified map name and font variant, or
              reuses the copy already loaded for the same name/variant pair.
              If the map file contains a definition such as :

                  $map = 'IDENTITY-H-GQJGLM.cid' ;

              then the specified map will be loaded instead (only one indirection is supported).
              When no usable map is found, the map is empty and every lookup yields an empty string.

          PARAMETERS
              $object_id -
                  Passed as is to the parent constructor.

              $map_name (string) -
                  Base name of the CID table, for example 'IDENTITY-H'.

              $font_variant (string) -
                  Font variant used to select a more specific CID table ; may be empty.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct($object_id, $map_name, $font_variant)
      {
          // Initialize parent objects
          parent::__construct($object_id);
          $this->HexCharWidth = 4;       // So far, CIDs are 2-bytes long

          // Since alternate characters can apparently be prefixed by 0x0000 or 0x0001, two calls to the array
          // access operator are needed to retrieve the exact character in such cases.
          // This is why we have to tell the upper layers not to cache the results
          $this->Cache = false;

          $map_index = "$map_name:$font_variant";

          // If this font has already been loaded somewhere, then reuse its information
          if (isset(self::$CachedMaps[$map_index])) {
              $map = self::$CachedMaps[$map_index]['map'];
              $file = self::$CachedMaps[$map_index]['file'];
          } else {
              $file = $this->__get_cid_file($map_name, $font_variant);

              // No CID map found ($file is false in that case) : lookups will return empty strings
              if ($file === false || !file_exists($file)) {
                  if (PdfToText::$DEBUG)
                      warning(new PdfToTextDecodingException("Could not find CID table \"$map_name\" in directory \"" . PdfToText::$CIDTablesDirectory . "\"."));
              }
              // Otherwise, load the CID map ; the included script is expected to define a local $map variable
              else {
                  include($file);

                  if (isset($map)) {
                      // We authorize one CID map to contain the name of another CID map file, instead of the
                      // map itself
                      if (is_string($map)) {
                          $file = PdfToText::$CIDTablesDirectory . "/$map";

                          // Do not include a redirection target that does not exist
                          if (file_exists($file))
                              include($file);
                          else if (PdfToText::$DEBUG)
                              warning(new PdfToTextDecodingException("CID table \"$file\" referenced by a redirection does not exist."));
                      }

                      // A missing target or a second level of redirection leaves something that is not a
                      // map : fall back to an empty map so that $Map is always an array
                      if (!is_array($map))
                          $map = array();

                      self::$CachedMaps[$map_index] = array('file' => $file, 'map' => $map);
                  } else if (PdfToText::$DEBUG)
                      warning(new PdfToTextDecodingException("CID \"$file\" does not contain any definition."));
              }
          }

          // Save map info for this CID font
          $this->MapFile = $file;
          $this->Map = (isset($map)) ? $map : array();
      }


      /*--------------------------------------------------------------------------------------------------------------

          __get_cid_file -
              Searches in the CIDTables directory for the CID map that best matches the specified map name (usually,
              IDENTITY-H) and the optional font variant.

              If a font variant has been specified, like "ABCD+Italic-Arial", then the CID tables directory will be
              searched for the following files, in the following order :
              - IDENTITY-H-ABCD+Italic-Arial.cid
              - IDENTITY-H-ABCD+Italic.cid
              - IDENTITY-H-ABCD.cid
              - If none found, then IDENTITY-H-empty.cid will be used and a warning will be issued in debug mode.

              In all cases, "$map_name.cid" is the last candidate.

          RETURN VALUE
              The path of the first candidate file that exists, or false if none exists.

       *-------------------------------------------------------------------------------------------------------------*/
      private function __get_cid_file($map_name, $font_variant)
      {
          $files = array();

          // Search for font variants, if any
          if ($font_variant) {
              // The variant ends up in the path of a file that gets included : never build candidates from a
              // variant holding a path separator or a NUL byte.
              // NOTE(review): the variant presumably comes from the PDF being parsed - confirm with callers.
              $is_safe = !preg_match('#[/\\\\\x00]#', $font_variant);

              if ($is_safe && preg_match('/^ (?P<name> [a-z_][a-z_0-9]*) (?P<rest> [\-+] .*) $/imsx', $font_variant, $match)) {
                  $basename = '-' . $match['name'];

                  if (preg_match_all('/ (?P<sep> [\-+]) (?P<name> [^\-+]+) /ix', $match['rest'], $other_matches)) {
                      // From the most specific candidate (all the parts) down to the least specific one (base
                      // name only) ; $i is the number of parts appended to the base name
                      for ($i = count($other_matches['name']); $i >= 0; $i--) {
                          $new_file = $basename;

                          for ($j = 0; $j < $i; $j++)
                              $new_file .= $other_matches['sep'][$j] . $other_matches['name'][$j];

                          $files[] = array(PdfToText::$CIDTablesDirectory . "/$map_name$new_file.cid", 'standard');
                      }
                  }
              }

              // Last one will be the empty CID font
              $files[] = array(PdfToText::$CIDTablesDirectory . "/IDENTITY-H-empty.cid", 'empty');
          }

          // Add the specified map file
          $files[] = array(PdfToText::$CIDTablesDirectory . "/$map_name.cid", 'default');

          // The first existing file in the list should be the appropriate one
          foreach ($files as $file) {
              if (!file_exists($file[0]))
                  continue;

              if (PdfToText::$DEBUG) {
                  if ($file[1] === 'empty')
                      warning(new PdfToTextDecodingException("Using empty IDENTITY-H definition for map \"$map_name\", variant \"$font_variant\"."));
                  else if ($file[1] === 'default')
                      warning(new PdfToTextDecodingException("Using default IDENTITY-H definition for map \"$map_name\"."));
              }

              return ($file[0]);
          }

          // No CID font found
          return (false);
      }


      /*--------------------------------------------------------------------------------------------------------------

          Interface implementations.

       *-------------------------------------------------------------------------------------------------------------*/

      // NOTE(review): this counts the top-level entries of the map (such as 'plain' and 'alt'), not the number
      // of CIDs in the 'plain' section that offsetExists() and offsetGet() work on - confirm what callers expect.
      public function count()
      {
          return (count($this->Map));
      }


      // Tells whether the 'plain' section of the map has an entry for the specified CID.
      public function offsetExists($offset)
      {
          return (isset($this->Map['plain'][$offset]));
      }


      // Returns the string mapped to the specified CID, or an empty string when there is none.
      // - A CID missing from the 'plain' section gives an empty string.
      // - An UNKNOWN_CID entry gives an empty string, or a "[UID:xxxx]" marker (also echoed) in debug mode.
      // - An ALT_CID entry gives an empty string and memorizes the CID ; the next regular lookup is then
      //   resolved through $Map['alt'][memorized CID][CID] instead of the 'plain' section.
      public function offsetGet($offset)
      {
          if (!isset($this->Map['plain'][$offset])) {
              $this->LastAltOffset = false;

              return ('');
          }

          $ch = $this->Map['plain'][$offset];

          switch ($ch) {
              case self::UNKNOWN_CID :
                  $this->LastAltOffset = false;

                  if (!PdfToText::$DEBUG)
                      return ('');

                  $marker = '[UID:' . sprintf('%04x', $offset) . "]";
                  echo ($marker);

                  return ($marker);

              case self::ALT_CID :
                  $this->LastAltOffset = (int) $offset;

                  return ('');

              default :
                  // Regular character, not preceded by an ALT_CID entry
                  if ($this->LastAltOffset === false)
                      return ($ch);

                  // Character following an ALT_CID entry : use the alternate table selected by that entry
                  $ch2 = '';

                  if (isset($this->Map['alt'][$this->LastAltOffset][$offset])) {
                      $ch2 = $this->Map['alt'][$this->LastAltOffset][$offset];

                      if ($ch2 == self::UNKNOWN_CID) {
                          if (PdfToText::$DEBUG) {
                              echo ("[CID{$this->LastAltOffset}:" . sprintf('%04x', $offset) . "]");

                              $ch2 = "[CID{$this->LastAltOffset}: $offset]";
                          }
                          // Outside debug mode, never let the UNKNOWN_CID marker value leak into the text
                          else
                              $ch2 = '';
                      }
                  }

                  $this->LastAltOffset = false;

                  return ($ch2);
          }
      }
  }
  8386.  
  8387.  
  8388.  
  8389. /*==============================================================================================================
  8390.  
  8391. PdfTexterIdentityHCIDMap -
  8392. A class for mapping IDENTITY-H CID fonts (or trying to...).
  8393.  
  8394. ==============================================================================================================*/
  class PdfTexterIdentityHCIDMap extends PdfTexterCIDMap
  {
      // Delegates to the generic CID map constructor, with the map name fixed to 'IDENTITY-H'.
      public function __construct($object_id, $font_variant)
      {
          parent::__construct(
              $object_id,
              'IDENTITY-H',
              $font_variant
          );
      }
  }
  8402.  
  8403.  
  8404.  
  8405. /*==============================================================================================================
  8406.  
  8407. PdfTexterPageMap -
  8408. A class for detecting page objects mappings and retrieving page number for a specified object.
  8409. There is a quadruple level of indirection here :
  8410.  
  8411. - The first level contains a /Type /Catalog parameter, with a /Pages one that references an object which
  8412. contains a /Count and /Kids. I don't know yet if the /Pages parameter can reference more than one
  8413. object using the array notation. However, the class is designed to handle such situations.
  8414. - The object containing the /Kids parameter references objects who, in turn, lists the objects contained
  8415. into one single page.
  8416. - Each object referenced in /Kids has a /Type/Page parameter, together with /Contents, which lists the
  8417. objects of the current page.
  8418.  
  8419. Object references are of the form : "x y R", where "x" is the object number.
  8420.  
  8421. Of course, anything can be in any order, otherwise it would not be funny ! Consider the following
  8422. example :
  8423.  
  8424. (1) 5 0 obj
  8425. << ... /Pages 1 0 R ... >>
  8426. endobj
  8427.  
  8428. (2) 1 0 obj
  8429. << ... /Count 1 /Kids[6 0 R] ... /Type/Pages ... >>
  8430. endobj
  8431.  
  8432. (3) 6 0 obj
  8433. << ... /Type/Page ... /Parent 1 0 R ... /Contents [10 0 R 11 0 R ... x 0 R]
  8434. endobj
  8435.  
  8436. Object #5 says that object #1 contains the list of page contents (in this example, there is only one page,
  8437. referenced by object #6).
  8438. Object #6 says that the objects #10, #11 through #x are contained into the same page.
  8439. The quadruple indirection comes when you are handling one of the objects referenced in object #6 and you
  8440. need to retrieve their page number...
  8441.  
  8442. Of course, you cannot rely on the fact that all objects appear in logical order.
  8443.  
  8444. And, of course #2, there may be no page catalog at all ! in such cases, objects containing drawing
  8445. instructions will have to be considered as a single page, whose number will be sequential.
  8446.  
  8447. And, of course #3, as this is the case with the official PDF 1.7 Reference from Adobe, there can be a
  8448. reference to a non-existing object which was meant to contain the /Kids parameter (!). In this case,
  8449. taking the ordinal number of objects of type (3) gives the page number minus one.
  8450.  
  8451. One mystery is that the PDF 1.7 Reference file contains 1310 pages but only 1309 are recognized here...
  8452.  
  8453. ==============================================================================================================*/
  8454. class PdfTexterPageMap extends PdfObjectBase
  8455. {
  8456. // Page contents are (normally) first described by a catalog
  8457. // Although there should be only one entry for that, this property is defined as an array, as you need to really
  8458. // become paranoid when handling pdf contents...
  8459. protected $PageCatalogs = array ( ) ;
  8460. // Entries that describe which page contains which text objects. Of course, these can be nested otherwise it would not be funny !
  8461. protected $PageKids = array ( ) ;
  8462. // Terminal entries : they directly give the ids of the objects belonging to a page
  8463. public $PageContents = array ( ) ;
  8464. // Note that all the above arrays are indexed by object id and filled with the data collected by calling the Peek() Method...
  8465.  
  8466. // Objects that could be referenced from other text objects as XObjects, using the /TPLx notation
  8467. protected $TemplateObjects = array ( ) ;
  8468.  
  8469. // Once the Peek() method has collected page contents & object information, the MapCatalog() method is called to create this array
  8470. // which contains page numbers as keys, and the list of objects contained in this page as values
  8471. public $Pages = array ( ) ;
  8472. // Holds page attributes
  8473. public $PageAttributes = array ( ) ;
  8474.  
  8475. // Resource mappings can either refer to an object (/Resources 2 0 R) or to inline mappings (/Resources << ... >>)
  8476. // The same object can be referenced by many /Resources parameters throughout the pdf file, so it's important to keep
  8477. // the analyzed mappings in a cache, so that later references will reuse the results of the first one
  8478. private $ResourceMappingCache = array ( ) ;
  8479. // List of XObject names - Used by the IsValidTemplate() function
  8480. private $XObjectNames = array ( ) ;
  8481.  
  8482.  
  8483. /*--------------------------------------------------------------------------------------------------------------
  8484.  
  8485. CONSTRUCTOR
  8486. Creates a PdfTexterPageMap object. Actually, nothing significant is performed here, as this class' goal
  8487. is to be used internally by PdfTexter.
  8488.  
  8489. *-------------------------------------------------------------------------------------------------------------*/
  // Nothing specific to set up here : only the parent constructor is run.
  public function __construct()
  {
      parent::__construct();
  }
  8494.  
  8495.  
  8496. /*--------------------------------------------------------------------------------------------------------------
  8497.  
  8498. NAME
  8499. AddTemplateObject - Adds an object that could be referenced as a template.
  8500.  
  8501. PROTOTYPE
  8502. $pagemap -> AddTemplateObject ( $object_id, $object_text_data ) ;
  8503.  
  8504. DESCRIPTION
  8505. Adds an object that may be referenced as a template from another text object, using the /TPLx notation.
  8506.  
  8507. PARAMETERS
  8508. $object_id (integer) -
  8509. Id of the object that may contain a resource mapping entry.
  8510.  
  8511. $object_text_data (string) -
  8512. Object contents.
  8513.  
  8514. *-------------------------------------------------------------------------------------------------------------*/
  // Records the text data of an object under its id in $TemplateObjects ; data already recorded for the
  // same id is replaced.
  public function AddTemplateObject($object_id, $object_text_data)
  {
      $this->TemplateObjects[$object_id] = $object_text_data;
  }
  8519.  
  8520.  
  8521. /*--------------------------------------------------------------------------------------------------------------
  8522.  
  8523. NAME
  8524. GetResourceMappings - Gets resource mappings specified after a /Resources parameter.
  8525.  
  8526. PROTOTYPE
  8527. $result = $this -> GetResourceMappings ( $object_id, $object_data, $parameter, $pdf_object_list ) ;
  8528.  
  8529. DESCRIPTION
  8530. Most of the time, objects containing a page description (/Type /Page) also contain a /Resources parameter,
  8531. which may be followed by one of the following constructs :
  8532. - A reference to an object, such as :
  8533. /Resources 2 0 R
  8534. - Or an inline set of parameters, such as font or xobject mappings :
  8535. /Resources << /Font<</F1 10 0 R ...>> /XObject <</Im0 27 0 R ...>>
  8536. This method extracts alias/object mappings for the parameter specified by $parameter (it can be for
  8537. example 'Font' or 'Xobject') and returns these mappings as an associative array.
  8538.  
  8539. PARAMETERS
  8540. $object_id (integer) -
  8541. Id of the object that may contain a resource mapping entry.
  8542.  
  8543. $object_data (string) -
  8544. Object contents.
  8545.  
  8546. $parameter (string) -
  8547. Parameter defining resource mapping, for example /Font or /XObject.
  8548.  
  8549. $pdf_object_list (associative array) -
  8550. Array of object id/object data associations, for all objects defined in the pdf file.
  8551.  
  8552. RETURN VALUE
  8553. The list of resource mappings for the specified parameter, as an associative array, whose keys are the
  8554. resource aliases and values are the corresponding object ids.
  8555. The method returns an empty array if the specified object does not contain resource mappings or does
  8556. not contain the specified parameter.
  8557.  
  8558. *-------------------------------------------------------------------------------------------------------------*/
  // Extracts the alias => object id mappings that the /Resources entry of $object_data gives for $parameter
  // (for example '/Font' or '/XObject').
  //
  // $object_id       : id of the object whose data is analyzed (not used by the lookup itself).
  // $object_data     : contents of that object.
  // $parameter       : resource category to extract, including its leading slash.
  // $pdf_object_list : object id => object data array, used to follow "/Resources x y R" and
  //                    "/XObject x y R" references.
  //
  // Returns an associative array whose keys are the aliases (with their leading slash, such as '/F1') and
  // whose values are object ids, as the numeric strings captured from the data. An empty array is returned
  // when there is no /Resources entry, when a referenced object does not exist or when the parameter is absent.
  protected function GetResourceMappings($object_id, $object_data, $parameter, $pdf_object_list)
  {
      // Id of the object referenced by /Resources, or false for inline mappings ; only results coming from
      // a referenced object are put in cache
      $resource_id = false;

      // The /Resources parameter refers to an existing PDF object
      if (preg_match('#/Resources \s* (?P<object_id> \d+) \s+ \d+ \s+ R#ix', $object_data, $match)) {
          $resource_id = $match['object_id'];

          // Return the cached result if the same object has previously been referenced by a /Resources
          // parameter. The cache is indexed by the id of the referenced object, so that all the objects
          // sharing the same resources reuse the first analysis.
          if (isset($this->ResourceMappingCache[$resource_id][$parameter]))
              return ($this->ResourceMappingCache[$resource_id][$parameter]);

          // Check that the object that is referred to exists
          if (!isset($pdf_object_list[$resource_id]))
              return (array());

          $data = $pdf_object_list[$resource_id];
      }
      // The /Resources parameter is followed by inline mappings : keep everything from the opening '<'
      else if (preg_match('#/Resources \s* <#ix', $object_data, $match, PREG_OFFSET_CAPTURE)) {
          $data = substr($object_data, $match[0][1] + strlen($match[0][0]) - 1);
      } else
          return (array());

      // Whatever we will be analyzing (an object contents or inline contents following the /Resources
      // parameter), the text will be enclosed within double angle brackets (<< ... >>)

      // A small kludge for /XObject which specify an object reference ("15 0 R") instead of XObjects mappings
      // ("<< ...>>" )
      if ($parameter === '/XObject' && preg_match('#/XObject \s+ (?P<obj> \d+) \s+ \d+ \s+ R#ix', $data, $match)) {
          // A reference to a missing object cannot provide any mapping
          if (!isset($pdf_object_list[$match['obj']]))
              return (array());

          $data = '/XObject ' . $pdf_object_list[$match['obj']];
      }

      // The parameter is quoted so that it is always taken literally by the regular expression
      $quoted_parameter = preg_quote($parameter, '#');

      if (!preg_match("#$quoted_parameter \s* << \s* (?P<mappings> .*?) \s* >>#imsx", $data, $match))
          return (array());

      preg_match_all('# (?P<mapping> / [^\s]+) \s+ (?P<object_id> \d+) \s+ \d+ \s+ R#ix', $match['mappings'], $matches);

      // Mapping extraction
      $mappings = array();

      foreach ($matches['object_id'] as $i => $mapped_id)
          $mappings[$matches['mapping'][$i]] = $mapped_id;

      // Put results for referenced objects in cache
      if ($resource_id !== false)
          $this->ResourceMappingCache[$resource_id][$parameter] = $mappings;

      return ($mappings);
  }
  8614.  
  8615.  
  8616. /*--------------------------------------------------------------------------------------------------------------
  8617.  
  8618. NAME
  8619. Peek - Peeks page information from a pdf object.
  8620.  
  8621. PROTOTYPE
  8622. $pagemap -> Peek ( $object_id, $object_data, $pdf_objects ) ;
  8623.  
  8624. DESCRIPTION
  8625. Retrieves page information which can be of type (1), (2) or (3), as described in the class comments.
  8626.  
  8627. PARAMETERS
  8628. $object_id (integer) -
  8629. Id of the current pdf object.
  8630.  
  8631. $object_data (string) -
  8632. Pdf object contents.
  8633.  
  8634. $pdf_objects (associative array) -
  8635. Objects defined in the pdf file, as an associative array whose keys are object numbers and
  8636. values object data.
  8637. This parameter is used for /Type/Page objects which have a /Resources parameter that references
  8638. an existing object instead of providing font mappings and other XObject mappings inline,
  8639. enclosed within double angle brackets (<< /Font ... >>).
  8640.  
  8641. *-------------------------------------------------------------------------------------------------------------*/
  // Looks at one pdf object and, depending on its /Type, records page-related information :
  // - /Type/Catalog : the objects referenced by /Pages are appended to $PageCatalogs.
  // - /Type/Pages   : an entry is stored in $PageKids (keys : object, parent, count, kids).
  // - /Type/Page    : an entry is stored in $PageContents (keys : object, parent, contents, fonts, xobjects,
  //                   width, height).
  // - /Type/XObject : an entry is stored in $PageContents as well, without the width and height keys.
  // Objects of any other type are ignored. In the stored entries, 'parent' and 'count' are false when the
  // corresponding parameter is missing.
  //
  // $object_id   : id of the current pdf object.
  // $object_data : contents of that object.
  // $pdf_objects : object id => object data array, used to follow references to other objects.
  //
  // Nothing is returned.
  public function Peek($object_id, $object_data, $pdf_objects)
  {
      // Page catalog (/Type/Catalog and /Pages x 0 R)
      if (preg_match('#/Type \s* /Catalog#ix', $object_data) &&
              $this->GetObjectReferences($object_id, $object_data, '/Pages', $references)) {
          $this->PageCatalogs = array_merge($this->PageCatalogs, $references);
      }
      // Object listing the object numbers that give the list of objects contained in a single page
      // (/Type/Pages and /Count x /Kids[x1 0 R ... xn 0 R])
      else if (preg_match('#/Type \s* /Pages#ix', $object_data)) {
          if ($this->GetObjectReferences($object_id, $object_data, '/Kids', $references)) {
              // Sometimes, a reference can be the one of an object that contains the real reference ; in the
              // following example, the actual page contents are not in object 4, but in object 5 :
              //    /Kids 4 0 R
              //    ...
              //    4 0 obj
              //       [5 0 R]
              //    endobj
              $new_references = array();

              foreach ($references as $reference) {
                  if (!isset($pdf_objects[$reference]) ||
                          !preg_match('/^ \s* (?P<ref> \[ [^]]+ \]) \s*$/imsx', $pdf_objects[$reference], $match)) {
                      $new_references[] = $reference;
                  } else {
                      // Start from an empty list, so that neither an undefined variable nor the result of a
                      // previous iteration can be merged if nothing is retrieved
                      $sub_references = array();
                      $this->GetObjectReferences($reference, $pdf_objects[$reference], '', $sub_references);

                      if (is_array($sub_references))
                          $new_references = array_merge($new_references, $sub_references);
                  }
              }

              // Get kid count (knowing that sometimes, it is missing...)
              preg_match('#/Count \s+ (?P<count> \d+)#ix', $object_data, $match);
              $page_count = (isset($match['count'])) ? (int) $match['count'] : false;

              // Get parent object id
              preg_match('#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match);
              $parent = (isset($match['parent'])) ? (int) $match['parent'] : false;

              $this->PageKids[$object_id] = array(
                  'object' => $object_id,
                  'parent' => $parent,
                  'count'  => $page_count,
                  'kids'   => $new_references
              );
          }
      }
      // Object listing the other objects that are contained in this page (/Type/Page and
      // /Contents[x1 0 R ... xn 0 R])
      else if (preg_match('#/Type \s* /Page\b#ix', $object_data)) {
          if ($this->GetObjectReferences($object_id, $object_data, '/Contents', $references)) {
              preg_match('#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match);
              $parent = (isset($match['parent'])) ? (int) $match['parent'] : false;
              $fonts = $this->GetResourceMappings($object_id, $object_data, '/Font', $pdf_objects);
              $xobjects = $this->GetResourceMappings($object_id, $object_data, '/XObject', $pdf_objects);

              // Find the width and height of the page (/MediaBox parameter). Coordinates are not always
              // unsigned integers : signed and real numbers such as "-10", "612.00" or ".5" are accepted too.
              $number = '[+\-]? (?: \d+ (?: \. \d* )? | \. \d+ )';
              $mediabox_re = '#/MediaBox \s* \[ \s* (?P<x1> ' . $number . ') \s+ (?P<y1> ' . $number . ') \s+ ' .
                             '(?P<x2> ' . $number . ') \s+ (?P<y2> ' . $number . ') \s* \]#imsx';

              if (preg_match($mediabox_re, $object_data, $match)) {
                  $width = (float) $match['x2'] - (float) $match['x1'] + 1;
                  $height = (float) $match['y2'] - (float) $match['y1'] + 1;
              }
              // Otherwise, fix an arbitrary width and height
              else {
                  $width = 595;
                  $height = 850;
              }

              // Yes ! some /Contents parameters may designate another object which contains references to the
              // real text contents in the form : [x 0 R y 0 R etc.], so we have to dig into it...
              $new_references = array();

              foreach ($references as $reference) {
                  // We just need to check that the object starts with something like :
                  //    [x 0 R y 0 R ...]
                  if (isset($pdf_objects[$reference]) && preg_match('#^\s* \[ [^]]+ \]#x', $pdf_objects[$reference]) &&
                          $this->GetObjectReferences($reference, $pdf_objects[$reference], '', $nested_references))
                      $new_references = array_merge($new_references, $nested_references);
                  else
                      $new_references[] = $reference;
              }

              $this->PageContents[$object_id] = array(
                  'object'   => $object_id,
                  'parent'   => $parent,
                  'contents' => $new_references,
                  'fonts'    => $fonts,
                  'xobjects' => $xobjects,
                  'width'    => $width,
                  'height'   => $height
              );
          }
      }
      // None of the above, but object contains /XObject's and maybe more...
      else if (preg_match('#/Type \s* /XObject\b#ix', $object_data)) {
          preg_match('#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match);
          $parent = (isset($match['parent'])) ? (int) $match['parent'] : false;
          $fonts = $this->GetResourceMappings($object_id, $object_data, '/Font', $pdf_objects);
          $xobjects = $this->GetResourceMappings($object_id, $object_data, '/XObject', $pdf_objects);

          // The entry is stored whether or not /Contents references are found ; start from an empty list so
          // that 'contents' is defined in both cases
          $references = array();
          $this->GetObjectReferences($object_id, $object_data, '/Contents', $references);

          $this->PageContents[$object_id] = array(
              'object'   => $object_id,
              'parent'   => $parent,
              'contents' => $references,
              'fonts'    => $fonts,
              'xobjects' => $xobjects
          );
      }
  }
  8764.  
  8765.  
  8766. /*--------------------------------------------------------------------------------------------------------------
  8767.  
  8768. NAME
  8769. ProcessTemplateReferences - Replace template references with actual text contents.
  8770.  
  8771. PROTOTYPE
  8772. $text = $pagemap -> ProcessTemplateReferences ( $page_number, $text_data ) ;
  8773.  
  8774. DESCRIPTION
  8775. Replaces template references of the form "/TPLx Do" with the actual text contents.
  8776.  
  8777. PARAMETERS
  8778. $page_number (integer) -
  8779. Page number of the object that contains the supplied object data.
  8780.  
  8781. $text_data (string)
  8782. Text drawing instructions that are to be processed.
  8783.  
  8784. RETURN VALUE
  8785. Returns the original text, where all template references have been replaced with the contents of the
  8786. object they refer to.
  8787.  
  8788. *-------------------------------------------------------------------------------------------------------------*/
  public function ProcessTemplateReferences ( $page_number, $text_data )
  {
      // Purpose : expand every "/TPLx Do" template reference found in $text_data, for the objects
      //           that have been mapped to page $page_number.
      // Returns : $text_data with the substitutions applied, or unchanged when the page is unknown
      //           or when none of its objects declares XObjects.

      // Unknown page number : nothing can be substituted
      if ( ! isset ( $this -> Pages [ $page_number ] ) )
          return ( $text_data ) ;

      foreach ( $this -> PageContents as $page_contents )
      {
          // Only objects that have been assigned to the requested page are of interest
          if ( ! isset ( $page_contents [ 'page' ] ) || $page_contents [ 'page' ] != $page_number )
              continue ;

          // MapKids() may create PageContents entries that only hold a 'page' key (objects without a
          // /Contents description) : such entries have no 'xobjects' key at all, so test it with empty()
          // instead of count(), which would fail on a missing key
          if ( empty ( $page_contents [ 'xobjects' ] ) )
              continue ;

          // Collect the search patterns and their replacement text, then apply them in one pass
          $template_searches = array ( ) ;
          $template_replacements = array ( ) ;

          $this -> __get_replacements ( $page_contents, $template_searches, $template_replacements ) ;
          $text_data = self::PregStrReplace ( $template_searches, $template_replacements, $text_data ) ;
      }

      return ( $text_data ) ;
  }
  8813.  
  8814.  
  8815. // __get_replacements -
  8816. // Recursively gets the search/replacement strings for template references.
  private function __get_replacements ( $page_contents, &$searches, &$replacements, $objects_seen = array ( ) )
  {
      // Purpose : append to $searches / $replacements one regex / replacement pair for every XObject
      //           of $page_contents that has an entry in TemplateObjects, then recurse into the
      //           XObjects that have their own PageContents entry.
      // $objects_seen holds the object ids already visited on the current recursion path, so that an
      // XObject which (directly or indirectly) references itself does not cause endless recursion.

      // Entries that carry no XObject mapping have nothing to contribute
      if ( empty ( $page_contents [ 'xobjects' ] ) )
          return ;

      foreach ( $page_contents [ 'xobjects' ] as $template_name => $template_object )
      {
          if ( ! isset ( $this -> TemplateObjects [ $template_object ] ) || isset ( $objects_seen [ $template_object ] ) )
              continue ;

          $template = $this -> TemplateObjects [ $template_object ] ;

          // The name comes from the PDF file and is embedded in a '#'-delimited regex : quote it so that
          // characters such as '.', '+' or '#' are matched literally instead of altering the pattern
          $searches [] = '#(' . preg_quote ( $template_name, '#' ) . ' \s+ Do\b )#msx' ;
          $replacements [] = '!PDFTOTEXT_TEMPLATE_' . substr ( $template_name, 1 ) . ' ' . $template ;
          $objects_seen [ $template_object ] = $template_object ;

          // A template may itself reference other templates
          if ( isset ( $this -> PageContents [ $template_object ] ) )
              $this -> __get_replacements ( $this -> PageContents [ $template_object ], $searches, $replacements, $objects_seen ) ;
      }
  }
  8833.  
  8834.  
  8835.  
  8836. /*--------------------------------------------------------------------------------------------------------------
  8837.  
  8838. NAME
  8839. MapObjects - Builds a correspondence between object and page numbers.
  8840.  
  8841. PROTOTYPE
  8842. $pagemap -> MapObjects ( ) ;
  8843.  
  8844. DESCRIPTION
  8845. Builds a correspondence between object and page numbers. The page number corresponding to an object id
  8846. will after that be available using the array notation.
  8847.  
  8848. NOTES
  8849. This method behaves as if there could be more than one page catalog in the same file, but I've not yet
  8850. encountered this case.
  8851.  
  8852. *-------------------------------------------------------------------------------------------------------------*/
  public function MapObjects ( $objects )
  {
      // Purpose : fill the Pages array (page number => object ids) from the page catalogs and page
      //           kids collected so far.
      // $objects : array whose keys are object ids ; it is only used for the "everything on page 1"
      //            fallback below.

      // Page numbering starts at 1. MapKids() receives this counter by reference and increments it
      // for each page it maps, so it has to be initialised before ANY call, including the ones made
      // when there is no page catalog.
      $current_page = 1 ;

      // No page catalog at all
      if ( ! count ( $this -> PageCatalogs ) )
      {
          // ... but some /Kids descriptions exist : use each of them as a starting point
          if ( count ( $this -> PageKids ) )
          {
              foreach ( array_keys ( $this -> PageKids ) as $catalog )
                  $this -> MapKids ( $catalog, $current_page ) ;
          }
          // Neither catalog nor kids : group every object onto a single page
          else
              $this -> Pages [1] = array_keys ( $objects ) ;

          return ;
      }

      // Regular case : the catalog(s) give access to all page descriptions
      foreach ( $this -> PageCatalogs as $catalog )
      {
          if ( isset ( $this -> PageKids [ $catalog ] ) )
              $this -> MapKids ( $catalog, $current_page ) ;
          // The catalog refers to an object for which no /Kids description was collected :
          // behave as if there was no catalog and group everything onto one page
          else
              $this -> Pages [1] = array_keys ( $objects ) ;
      }
  }
  8886.  
  8887.  
  8888. /*--------------------------------------------------------------------------------------------------------------
  8889.  
  8890. NAME
  8891. MapKids - Establishes a correspondence between page kids and a current page number.
  8892. 
  8893. PROTOTYPE
  8894. $pagemap -> MapKids ( $catalog, &$page ) ;
  8895.  
  8896. DESCRIPTION
  8897. Tries to assign a page number to all page description objects that have been collected by the Peek()
  8898. method.
  8899. Also creates the Pages associative array, whose keys are page numbers and whose values are the ids of
  8900. the objects that the page contains.
  8901.  
  8902. EXAMPLE
  8903. The following example gives an overview of a possible layout for page catalogs ; it describes which
  8904. objects contain what.
  8905. Lines starting with "#x", where "x" is a number, stands for a PDF object definition, which will start
  8906. with "x 0 obj" in the PDF file.
  8907. Whenever numbers are referenced (other than those prefixed with a "#"), it means "reference to the
  8908. specified object.
  8909. For example, "54" will refer to object #54, and will be given as "54 0 R" in the PDF file.
  8910. The numbers at the beginning of each line are just "step numbers", which will be referenced in the
  8911. explanations after the example :
  8912.  
  8913. (01) #1 : /Type/Catalog /Pages 54
  8914. (02) -> #54 : /Type/Pages /Kids[3 28 32 58] /Count 5
  8915. (03) -> #3 : /Type/Page /Parent 54 /Contents[26]
  8916. (04) -> #26 : page contents
  8917. (05) -> #28 : /Type/Page /Parent 54 /Contents[30 100 101 102 103 104]
  8918. (06) -> #30 : page contents
  8919. (07) -> #32 : /Type/Page /Parent 54 /Contents[34]
  8920. (08) -> #34 : page contents
  8921. (09) -> #58 : /Type/Pages /Parent 54 /Count 2 /Kids[36 40]
  8922. (10) -> #36 : /Type/Page /Parent 58 /Contents[38]
  8923. (11) -> #38 : page contents
  8924. (12) -> #40 : /Type/Page /Parent 58 /Contents[42]
  8925. (13) -> #42 : page contents
  8926.  
  8927. Explanations :
  8928. (01) Object #1 contains the page catalog ; it states that a further description of the page
  8929. contents is given by object #54.
  8930. Note that it could reference multiple page descriptions, such as : /Pages [54 68 99...]
  8931. (although I have not met this case so far)
  8932. (02) Object #54 in turn says that it has "kids", described by objects #3, #28, #32 and #58. It
  8933. also says that it has 5 pages (/Count parameter) ; but wait... the /Kids parameter references
  8934. 4 objects while the /Count parameter states that we have 5 pages : what happens ? we will
  8935. discover it in the explanations below.
  8936. (03) Object #3 states that it is aimed for page description (/Type/Page) ; the page contents
  8937. will be found in object #26, specified after the /Contents parameter. Note that here again,
  8938. multiple objects could be referenced by the /Contents parameter but, in our case, there is
  8939. only one, 26. Object #3 also says that its parent object (in the page catalog) is object
  8940. #54, defined in (01).
  8941. Since this is the first page we met, it will have page number 1.
  8942. (04) ... object #26 contains the Postscript instructions to draw page #1
  8943. (05) Object #28 has the same type as #3 ; its page contents can be located in object #30 (06)
  8944. The same applies for object #32 (07), whose page contents are given by object #34 (08).
  8945. So, (05) and (07) will be pages 2 and 3, respectively.
  8946. (09) Now, it starts to become interesting : object #58 does not directly lead to an object
  8947. containing Postscript instructions as did objects #3, #28 and #32 whose parent is #54, but
  8948. to yet another page catalog which contains 2 pages (/Count 2), described by objects #36 and
  8949. #40. It's not located at the same position as object #54 in the hierarchy, so it shows that
  8950. page content descriptions can be recursively nested.
  8951. (10) Object #36 says that we will find the page contents in object #38 (which will be page 4)
  8952. (12) ... and object #40 says that we will find the page contents in object #42 (and our final
  8953. page, 5)
  8954.  
  8955. *-------------------------------------------------------------------------------------------------------------*/
  protected function MapKids ( $catalog, &$page )
  {
      // Purpose : walk the /Kids list of object $catalog and assign consecutive page numbers,
      //           starting at $page, to the page objects it leads to.
      // $page is taken by reference and is incremented once per mapped page, so that the caller
      // (and the recursive calls) keep numbering where this call stopped.

      // Nothing is known about this object : no kids to map
      if ( ! isset ( $this -> PageKids [ $catalog ] ) )
          return ;

      // Each kid is classified on its own. Deciding for the whole list from its first element would
      // lose the plain pages of a /Kids array that starts with a nested /Pages node or with a page
      // that has no /Contents, and would read offset 0 of a possibly empty list.
      foreach ( $this -> PageKids [ $catalog ] [ 'kids' ] as $item )
      {
          // The kid is itself a node with its own /Kids list : one more level of indirection
          if ( isset ( $this -> PageKids [ $item ] ) )
          {
              $this -> MapKids ( $item, $page ) ;
              continue ;
          }

          // The kid is a page : remember its number and the objects holding its contents.
          // A kid without a PageContents entry still consumes a page number, with an empty content list.
          $this -> PageContents [ $item ] [ 'page' ] = $page ;
          $this -> Pages [ $page ] = ( isset ( $this -> PageContents [ $item ] [ 'contents' ] ) ) ?
              $this -> PageContents [ $item ] [ 'contents' ] : array ( ) ;

          // Page dimensions are only recorded when they were collected for this page
          if ( isset ( $this -> PageContents [ $item ] [ 'width' ] ) )
          {
              $this -> PageAttributes [ $page ] = array
              (
                  'width' => $this -> PageContents [ $item ] [ 'width' ],
                  'height' => $this -> PageContents [ $item ] [ 'height' ]
              ) ;
          }

          $page ++ ;
      }
  }
  9002.  
  9003.  
  9004. /*--------------------------------------------------------------------------------------------------------------
  9005.  
  9006. NAME
  9007. GetMappedFonts - Retrieves the mapped fonts per page
  9008.  
  9009. PROTOTYPE
  9010. $array = $pagemap -> GetMappedFonts ( ) ;
  9011.  
  9012. DESCRIPTION
  9013. Gets the mapped fonts, per page. XObjects are traversed, to retrieve additional font aliases defined
  9014. by them.
  9015. This function is used by the PdfTexter class to add additional entries to the FontMap object,
  9016. ensuring that each reference to a font remains local to a page.
  9017.  
  9018. RETURN VALUE
  9019. Returns an array of associative arrays which have the following entries :
  9020. - 'page' :
  9021. Page number.
  9022. - 'xobject-name' :
  9023. XObject name, that can define further font aliases. This entry is set to the empty string for
  9024. global font aliases.
  9025. - 'font-name' :
  9026. Font name (eg, "/F1", "/C1_0", etc.).
  9027. - 'object' :
  9028. Object defining the font attributes, such as character map, etc.
  9029.  
  9030. *-------------------------------------------------------------------------------------------------------------*/
  public function GetMappedFonts ( )
  {
      // Purpose : list, page by page, the font aliases declared by each page and by the XObjects
      //           reachable from it.
      // Returns : an array of associative arrays with the keys 'page', 'xobject-name', 'font-name'
      //           and 'object' ; 'xobject-name' is the empty string for fonts declared by the page itself.
      // NOTE(review): pages are numbered by counting the direct kids of each catalog entry, whereas
      // MapKids() also descends into nested /Kids lists - confirm both numberings agree for nested trees.
      $mapped_fonts = array ( ) ;
      $current_page = 0 ;

      foreach ( $this -> PageCatalogs as $catalog )
      {
          if ( ! isset ( $this -> PageKids [ $catalog ] ) )
              continue ;

          foreach ( $this -> PageKids [ $catalog ] [ 'kids' ] as $page_object )
          {
              $current_page ++ ;

              if ( ! isset ( $this -> PageContents [ $page_object ] ) )
                  continue ;

              $page_contents = $this -> PageContents [ $page_object ] ;
              // "xobject-name:font-name" keys already emitted for this page, to avoid duplicates
              $associations = array ( ) ;

              // Fonts declared directly by the page
              if ( isset ( $page_contents [ 'fonts' ] ) )
              {
                  foreach ( $page_contents [ 'fonts' ] as $font_name => $font_object )
                  {
                      $mapped_fonts [] = array
                      (
                          'page' => $current_page,
                          'xobject-name' => '',
                          'font-name' => $font_name,
                          'object' => $font_object
                      ) ;

                      $associations [ ":$font_name" ] = $font_object ;
                  }
              }

              // Fonts declared by the XObjects of the page : collected once per page, whether or not
              // the page declares fonts of its own
              if ( ! empty ( $page_contents [ 'xobjects' ] ) )
                  $this -> __map_recursive ( $current_page, $page_contents [ 'xobjects' ], $mapped_fonts, $associations ) ;
          }
      }

      return ( $mapped_fonts ) ;
  }
  9073.  
  9074.  
  9075. // __map_recursive -
  9076. // Recursively collects font aliases for XObjects.
  private function __map_recursive ( $page_number, $xobjects, &$mapped_fonts, &$associations, $objects_seen = array ( ) )
  {
      // Purpose : append to $mapped_fonts the font aliases declared by the given XObjects, and by the
      //           XObjects they reference in turn ; every visited XObject name is recorded in XObjectNames.
      // $associations remembers the "xobject-name:font-name" pairs already emitted.
      // $objects_seen holds the object ids visited on the current recursion path ; it prevents endless
      // recursion when an XObject references itself, directly or through other XObjects.
      foreach ( $xobjects as $xobject_name => $xobject_value )
      {
          // Skip objects that are not described, or that are already being processed higher in the chain
          if ( ! isset ( $this -> PageContents [ $xobject_value ] ) || isset ( $objects_seen [ $xobject_value ] ) )
              continue ;

          $xobject_contents = $this -> PageContents [ $xobject_value ] ;

          if ( isset ( $xobject_contents [ 'fonts' ] ) )
          {
              foreach ( $xobject_contents [ 'fonts' ] as $font_name => $font_object )
              {
                  $key = "$xobject_name:$font_name" ;

                  if ( isset ( $associations [ $key ] ) )
                      continue ;

                  $mapped_fonts [] = array
                  (
                      'page' => $page_number,
                      'xobject-name' => $xobject_name,
                      'font-name' => $font_name,
                      'object' => $font_object
                  ) ;

                  $associations [ $key ] = $font_object ;
              }
          }

          $this -> XObjectNames [ $xobject_name ] = 1 ;

          // Descend into nested XObjects, marking the current one as visited for that branch only
          if ( ! empty ( $xobject_contents [ 'xobjects' ] ) )
          {
              $nested_seen = $objects_seen ;
              $nested_seen [ $xobject_value ] = true ;

              $this -> __map_recursive ( $page_number, $xobject_contents [ 'xobjects' ], $mapped_fonts, $associations, $nested_seen ) ;
          }
      }
  }
  9104.  
  9105.  
  9106.  
  9107. /*--------------------------------------------------------------------------------------------------------------
  9108.  
  9109. NAME
  9110. IsValidXObjectName - Checks if the specified name is a valid XObject name.
  9111.  
  9112. PROTOTYPE
  9113. $status = $pagemap -> IsValidXObjectName ( $name ) ;
  9114.  
  9115. DESCRIPTION
  9116. Checks if the specified name is a valid XObject defining its own set of font aliases.
  9117.  
  9118. PARAMETERS
  9119. $name (string) -
  9120. Name of the XObject to be checked.
  9121.  
  9122. RETURN VALUE
  9123. Returns true if the specified XObject exists and defines its own set of font aliases, false otherwise.
  9124.  
  9125. *-------------------------------------------------------------------------------------------------------------*/
  public function IsValidXObjectName ( $name )
  {
      // XObjectNames is keyed by XObject name and is filled while fonts are being mapped
      // (see __map_recursive) ; a name is valid when it has an entry there
      $is_known = isset ( $this -> XObjectNames [ $name ] ) ;

      return ( $is_known ) ;
  }
  9128. }
  9129.  
  9130.  
  9131. /**************************************************************************************************************
  9132. **************************************************************************************************************
  9133. **************************************************************************************************************
  9134. ****** ******
  9135. ****** ******
  9136. ****** IMAGE MANAGEMENT ******
  9137. ****** ******
  9138. ****** ******
  9139. **************************************************************************************************************
  9140. **************************************************************************************************************
  9141. **************************************************************************************************************/
  9142.  
  9143. /*==============================================================================================================
  9144.  
  9145. class PdfImage -
  9146. Holds image data coming from pdf.
  9147.  
  9148. ==============================================================================================================*/
  abstract class PdfImage extends PdfObjectBase
  {
      // Image resource that can be used to process image data, using the php imagexxx() functions
      public $ImageResource = false ;
      // Original image data, as supplied to the constructor
      protected $ImageData ;
      // True when no image resource has been created by the constructor ; in that case the original
      // image data is the only thing available for saving
      protected $NoResourceCreated ;


      /*--------------------------------------------------------------------------------------------------------------

          CONSTRUCTOR
              Stores the image data and, unless $no_resource_created is true, asks the derived class to
              build an image resource from it.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $image_data, $no_resource_created = false )
      {
          $this -> ImageData = $image_data ;
          $this -> NoResourceCreated = $no_resource_created ;

          if ( ! $no_resource_created )
              $this -> ImageResource = $this -> CreateImageResource ( $image_data ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          DESTRUCTOR
              Destroys the associated image resource.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __destruct ( )
      {
          $this -> DestroyImageResource ( ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          CreateImageResource -
              Creates an image resource from the supplied image data. Implemented by derived classes.

          PARAMETERS
              $image_data (string) -
                  Image data.

       *-------------------------------------------------------------------------------------------------------------*/
      abstract protected function CreateImageResource ( $image_data ) ;


      /*--------------------------------------------------------------------------------------------------------------

          DestroyImageResource -
              Destroys the allocated image resource with imagedestroy(), then forgets it so that a second
              call (for example an explicit call followed by the destructor) does not destroy it twice.
              Can be overridden by derived classes whose resource does not come from the gd library.

       *-------------------------------------------------------------------------------------------------------------*/
      protected function DestroyImageResource ( )
      {
          if ( $this -> ImageResource )
          {
              imagedestroy ( $this -> ImageResource ) ;
              $this -> ImageResource = false ;
          }
      }


      /*--------------------------------------------------------------------------------------------------------------

          SaveAs -
              Saves the current image to the specified output file, in the specified format.

          PARAMETERS
              $output_file (string) -
                  Output filename. When null, the image is sent to the standard output instead.

              $image_type (integer) -
                  Output format, one of the IMG_* constants handled below.

          NOTES
              When no image resource exists, the original data can only be written as is, and only if it
              was flagged as such at construction time and JPEG output is requested.

       *-------------------------------------------------------------------------------------------------------------*/
      public function SaveAs ( $output_file, $image_type = IMG_JPEG )
      {
          if ( ! $this -> ImageResource )
          {
              if ( $this -> NoResourceCreated && $image_type == IMG_JPEG )
              {
                  // Output() calls SaveAs ( null ) : file_put_contents() cannot take a null filename,
                  // so send the raw data to the standard output, as imagejpeg() does for a null filename
                  if ( $output_file === null )
                      echo $this -> ImageData ;
                  else
                      file_put_contents ( $output_file, $this -> ImageData ) ;
              }
              else if ( PdfToText::$DEBUG )
                  warning ( new PdfToTextDecodingException ( "No image resource allocated." ) ) ;

              return ;
          }

          // Bitmask of the image formats supported by the current gd build
          $image_types = imagetypes ( ) ;

          switch ( $image_type )
          {
              case IMG_JPEG :
              case IMG_JPG :
                  if ( ! ( $image_types & IMG_JPEG ) && ! ( $image_types & IMG_JPG ) )
                      error ( new PdfToTextDecodingException ( "Your current PHP version does not support JPG images." ) ) ;

                  imagejpeg ( $this -> ImageResource, $output_file, 100 ) ;
                  break ;

              case IMG_GIF :
                  if ( ! ( $image_types & IMG_GIF ) )
                      error ( new PdfToTextDecodingException ( "Your current PHP version does not support GIF images." ) ) ;

                  imagegif ( $this -> ImageResource, $output_file ) ;
                  break ;

              case IMG_PNG :
                  if ( ! ( $image_types & IMG_PNG ) )
                      error ( new PdfToTextDecodingException ( "Your current PHP version does not support PNG images." ) ) ;

                  imagepng ( $this -> ImageResource, $output_file, 0 ) ;
                  break ;

              case IMG_WBMP :
                  if ( ! ( $image_types & IMG_WBMP ) )
                      error ( new PdfToTextDecodingException ( "Your current PHP version does not support WBMP images." ) ) ;

                  imagewbmp ( $this -> ImageResource, $output_file ) ;
                  break ;

              case IMG_XPM :
                  if ( ! ( $image_types & IMG_XPM ) )
                      error ( new PdfToTextDecodingException ( "Your current PHP version does not support XPM images." ) ) ;

                  // NOTE(review): imagexbm() writes XBM data, not XPM - confirm this mapping is intended
                  imagexbm ( $this -> ImageResource, $output_file ) ;
                  break ;

              default :
                  error ( new PdfToTextDecodingException ( "Unknown image type #$image_type." ) ) ;
          }
      }


      // Output -
      //    Sends the image to the standard output, in JPEG format.
      public function Output ( )
      {
          $this -> SaveAs ( null ) ;
      }
  }
  9309.  
  9310.  
  9311.  
  9312. /*==============================================================================================================
  9313.  
  9314. class PdfJpegImage -
  9315. Handles encoded JPG images.
  9316.  
  9317. ==============================================================================================================*/
  class PdfJpegImage extends PdfImage
  {
      // Constructor -
      //    $autosave is handed to the parent as its "no resource created" flag : when it is true,
      //    the parent keeps the image data but does not build an image resource.
      public function __construct ( $image_data, $autosave )
      {
          $no_resource_created = $autosave ;

          parent::__construct ( $image_data, $no_resource_created ) ;
      }


      // CreateImageResource -
      //    Builds the image resource from the image data, using imagecreatefromstring().
      protected function CreateImageResource ( $image_data )
      {
          $resource = imagecreatefromstring ( $image_data ) ;

          return ( $resource ) ;
      }
  }
  9331.  
  9332.  
  9333. /*==============================================================================================================
  9334.  
  9335. class PdfInlinedImage -
  9336. Decodes raw image data in objects having the /FlateDecode flag.
  9337.  
  9338. ==============================================================================================================*/
  9339. class PdfInlinedImage extends PdfImage
  9340. {
  9341. // Supported color schemes
  9342. const COLOR_SCHEME_RGB = 1 ;
  9343. const COLOR_SCHEME_CMYK = 2 ;
  9344. const COLOR_SCHEME_GRAY = 3 ;
  9345.  
  9346. // Color scheme names, for debugging only
  9347. private static $DecoderNames = array
  9348. (
  9349. self::COLOR_SCHEME_RGB => 'RGB',
  9350. self::COLOR_SCHEME_CMYK => 'CMYK',
  9351. self::COLOR_SCHEME_GRAY => 'Gray'
  9352. ) ;
  9353.  
  9354. // Currently implemented image decoders
  9355. private static $Decoders = array
  9356. (
  9357. self::COLOR_SCHEME_RGB => array
  9358. (
  9359. 8 => '__decode_rgb8'
  9360. ),
  9361. self::COLOR_SCHEME_GRAY => array
  9362. (
  9363. 8 => '__decode_gray8'
  9364. ),
  9365. self::COLOR_SCHEME_CMYK => array
  9366. (
  9367. 8 => '__decode_cmyk8'
  9368. ),
  9369. ) ;
  9370.  
  9371. // Image width and height
  9372. public $Width,
  9373. $Height ;
  9374. // Color scheme
  9375. public $ColorScheme ;
  9376. // Number of bits per color component
  9377. public $BitsPerComponent ;
  9378. // Decoding function, varying upon the supplied image type
  9379. public $DecodingFunction = false ;
  9380.  
  9381.  
  9382. /*--------------------------------------------------------------------------------------------------------------
  9383.  
  9384. NAME
  9385. Constructor - Builds an image from the supplied data.
  9386.  
  9387. PROTOTYPE
  9388. $image = new PdfInlinedImage ( $image_data, $width, $height, $bits_per_component, $color_scheme ) ;
  9389.  
  9390. DESCRIPTION
  9391. Builds an image from the supplied data. Checks that the image flags are supported.
  9392.  
  9393. PARAMETERS
  9394. $image_data (string) -
  9395. Uncompressed image data.
  9396.  
  9397. $width (integer) -
  9398. Image width, in pixels.
  9399.  
  9400. $height (integer) -
  9401. Image height, in pixels.
  9402.  
  9403. $bits_per_component (integer) -
  9404. Number of bits per color component.
  9405.  
  9406. $color_scheme (integer) -
  9407. One of the COLOR_SCHEME_* constants, specifying the initial data format.
  9408.  
  9409. NOTES
  9410. Processed images are always converted to JPEG format.
  9411.  
  9412. *-------------------------------------------------------------------------------------------------------------*/
  public function __construct ( $image_data, $width, $height, $bits_per_component, $color_scheme )
  {
      // Remember the image characteristics
      $this -> ColorScheme = $color_scheme ;
      $this -> BitsPerComponent = $bits_per_component ;
      $this -> Height = $height ;
      $this -> Width = $width ;

      // Select the decoding function matching the color scheme and the number of bits per component ;
      // an error is reported when either of them is not handled
      if ( ! isset ( self::$Decoders [ $color_scheme ] ) )
      {
          error ( new PdfToTextDecodingException ( "Unknown color scheme $color_scheme." ) ) ;
      }
      else if ( ! isset ( self::$Decoders [ $color_scheme ] [ $bits_per_component ] ) )
      {
          error ( new PdfToTextDecodingException ( "No decoding function has been implemented for image objects having the " .
              self::$DecoderNames [ $color_scheme ] . " color scheme with $bits_per_component bits per color component." ) ) ;
      }
      else
      {
          $this -> DecodingFunction = self::$Decoders [ $color_scheme ] [ $bits_per_component ] ;
      }

      // The parent constructor stores the data and creates the image resource
      parent::__construct ( $image_data ) ;
  }
  9434.  
  9435.  
  9436. /*--------------------------------------------------------------------------------------------------------------
  9437.  
  9438. NAME
  9439. CreateInstance - Creates an appropriate instance of a PdfImage class.
  9440.  
  9441. PROTOTYPE
  9442. $image = PdfInlinedImage::CreateInstance ( $stream_data, $object_data, $autosave ) ;
  9443.  
  9444. DESCRIPTION
  9445. Creates an instance of either :
  9446. - A PdfJpegImage class, if the image specifications in $object_data indicate that the compressed stream
  9447. contents are only JPEG data
  9448. - A PdfInlinedImage class, if the image specifications state that the compressed stream data contain
  9449. only color values.
  9450.  
  9451. The class currently supports (in $stream_data) :
  9452. - Pure JPEG contents
  9453. - RGB values
  9454. - CMYK values
  9455. - Gray scale values (in the current version, the resulting image does not correctly reproduce the
  9456. initial colors, if interpolation is to be used).
  9457.  
  9458. PARAMETERS
  9459. $stream_data (string) -
  9460. Compressed image data.
  9461.  
  9462. $object_data (string) -
  9463. Object containing the stream data.
  9464.  
  9465. RETURN VALUE
  9466. Returns :
  9467. - A PdfJpegImage object, if the stream data contains only pure JPEG contents
  9468. - A PdfInlinedImage object, in other cases.
  9469. - False if the supplied image data is not currently supported.
  9470.  
  9471. *-------------------------------------------------------------------------------------------------------------*/
  9472. public static function CreateInstance ( $stream_data, $object_data, $autosave )
  9473. {
  9474. // Remove stream data from the supplied object data, to speed up the searches below
  9475. $index = strpos ( $object_data, 'stream' ) ;
  9476.  
  9477. if ( $index !== false )
  9478. $object_data = substr ( $object_data, 0, $index ) ;
  9479.  
  9480. // Uncompress stream data
  9481. $image_data = gzuncompress ( $stream_data ) ;
  9482.  
  9483. // The /DCTDecode flag indicates JPEG contents - returns a PdfJpegImage object
  9484. if ( stripos ( $object_data, '/DCTDecode' ) )
  9485. return ( new PdfJpegImage ( $image_data, $autosave ) ) ;
  9486.  
  9487. // Get the image width & height
  9488. $match = null ;
  9489. preg_match ( '#/Width \s+ (?P<value> \d+)#ix', $object_data, $match ) ;
  9490. $width = ( integer ) $match [ 'value' ] ;
  9491.  
  9492. $match = null ;
  9493. preg_match ( '#/Height \s+ (?P<value> \d+)#ix', $object_data, $match ) ;
  9494. $height = ( integer ) $match [ 'value' ] ;
  9495.  
  9496. // Get the number of bits per color component
  9497. $match = null ;
  9498. preg_match ( '#/BitsPerComponent \s+ (?P<value> \d+)#ix', $object_data, $match ) ;
  9499. $bits_per_component = ( integer ) $match [ 'value' ] ;
  9500.  
  9501. // Get the target color space
  9502. // Sometimes, this refers to an object in the PDF file, which can also be embedded in a compound object
  9503. // We don't handle such cases for now
  9504. $match = null ;
  9505. preg_match ( '#/ColorSpace \s* / (?P<value> \w+)#ix', $object_data, $match ) ;
  9506.  
  9507. if ( ! isset ( $match [ 'value' ] ) )
  9508. return ( false ) ;
  9509.  
  9510. $color_space_name = $match [ 'value' ] ;
  9511.  
  9512. // Check that we are able to handle the specified color space
  9513. switch ( strtolower ( $color_space_name ) )
  9514. {
  9515. case 'devicergb' :
  9516. $color_space = self::COLOR_SCHEME_RGB ;
  9517. break ;
  9518.  
  9519. case 'devicegray' :
  9520. $color_space = self::COLOR_SCHEME_GRAY ;
  9521. break ;
  9522.  
  9523. case 'devicecmyk' :
  9524. $color_space = self::COLOR_SCHEME_CMYK ;
  9525. break ;
  9526.  
  9527. default :
  9528. if ( PdfToText::$DEBUG )
  9529. warning ( new PdfToTextDecodingException ( "Unsupported color space \"$color_space_name\"." ) ) ;
  9530.  
  9531. return ( false ) ;
  9532. }
  9533.  
  9534. // Also check that we can handle the specified number of bits per component
  9535. switch ( $bits_per_component )
  9536. {
  9537. case 8 :
  9538. break ;
  9539.  
  9540. default :
  9541. if ( PdfToText::$DEBUG )
  9542. warning ( new PdfToTextDecodingException ( "Unsupported bits per component : $bits_per_component." ) ) ;
  9543.  
  9544. return ( false ) ;
  9545. }
  9546.  
  9547. // All done, return a PdfInlinedImage object
  9548. return ( new PdfInlinedImage ( $image_data, $width, $height, $bits_per_component, $color_space ) ) ;
  9549. }
  9550.  
  9551.  
  9552. /*--------------------------------------------------------------------------------------------------------------
  9553.  
  9554. NAME
  9555. CreateImageResource - Creates the image resource.
  9556.  
  9557. PROTOTYPE
  9558. $resource = $image -> CreateImageResource ( $image_data ) ;
  9559.  
  9560. DESCRIPTION
  9561. Creates a GD image according to the supplied image data, and the parameters supplied to the class
  9562. constructor.
  9563.  
  9564. PARAMETERS
  9565. $image_data (string) -
  9566. Image to be decoded.
  9567.  
  9568. RETURN VALUE
  9569. Returns a GD graphics resource in true color, or false if there is currently no implemented decoding
  9570. function for this kind of images.
  9571.  
  9572. *-------------------------------------------------------------------------------------------------------------*/
  9573. protected function CreateImageResource ( $image_data )
  9574. {
  9575. $decoder = $this -> DecodingFunction ;
  9576.  
  9577. if ( $decoder )
  9578. return ( $this -> $decoder ( $image_data ) ) ;
  9579. else
  9580. return ( false ) ;
  9581. }
  9582.  
  9583.  
  9584. /*--------------------------------------------------------------------------------------------------------------
  9585.  
  9586. Decoding functions.
  9587.  
  9588. *-------------------------------------------------------------------------------------------------------------*/
  9589.  
  9590. // __decode_rgb8 -
  9591. // Decodes image data consisting of 8-bits RGB values (one byte for each color component).
  9592. private function __decode_rgb8 ( $data )
  9593. {
  9594. $data_length = strlen ( $data ) ;
  9595. $colors = array ( ) ;
  9596. $width = $this -> Width ;
  9597. $height = $this -> Height ;
  9598. $image = imagecreatetruecolor ( $width, $height ) ;
  9599.  
  9600. for ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i + 3 <= $data_length ; $i += 3, $pixel_x ++ )
  9601. {
  9602. $red = ord ( $data [$i] ) ;
  9603. $green = ord ( $data [$i+1] ) ;
  9604. $blue = ord ( $data [$i+2] ) ;
  9605.  
  9606. $color = ( $red << 16 ) | ( $green << 8 ) | ( $blue ) ;
  9607.  
  9608. if ( isset ( $colors [ $color ] ) )
  9609. $pixel_color = $colors [ $color ] ;
  9610. else
  9611. {
  9612. $pixel_color = imagecolorallocate ( $image, $red, $green, $blue ) ;
  9613. $colors [ $color ] = $pixel_color ;
  9614. }
  9615.  
  9616. if ( $pixel_x >= $width )
  9617. {
  9618. $pixel_x = 0 ;
  9619. $pixel_y ++ ;
  9620. }
  9621.  
  9622. imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ;
  9623. }
  9624.  
  9625. return ( $image ) ;
  9626. }
  9627.  
  9628.  
  9629. // __decode_cmyk8 -
  9630. // Decodes image data consisting of 8-bits CMYK values (one byte for each color component).
  9631. private function __decode_cmyk8 ( $data )
  9632. {
  9633. $data_length = strlen ( $data ) ;
  9634. $colors = array ( ) ;
  9635. $width = $this -> Width ;
  9636. $height = $this -> Height ;
  9637. $image = imagecreatetruecolor ( $width, $height ) ;
  9638.  
  9639. for ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i + 4 <= $data_length ; $i += 4, $pixel_x ++ )
  9640. {
  9641. $cyan = ord ( $data [$i] ) ;
  9642. $magenta = ord ( $data [$i+1] ) ;
  9643. $yellow = ord ( $data [$i+2] ) ;
  9644. $black = ord ( $data [$i+3] ) ;
  9645.  
  9646. $color = ( $cyan << 24 ) | ( $magenta << 16 ) | ( $yellow << 8 ) | ( $black ) ;
  9647.  
  9648. if ( isset ( $colors [ $color ] ) )
  9649. $pixel_color = $colors [ $color ] ;
  9650. else
  9651. {
  9652. $rgb = $this -> __convert_cmyk_to_rgb ( $cyan, $magenta, $yellow, $black ) ;
  9653. $pixel_color = imagecolorallocate ( $image, $rgb [0], $rgb [1], $rgb [2] ) ;
  9654. $colors [ $color ] = $pixel_color ;
  9655. }
  9656.  
  9657. if ( $pixel_x >= $width )
  9658. {
  9659. $pixel_x = 0 ;
  9660. $pixel_y ++ ;
  9661. }
  9662.  
  9663. imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ;
  9664. }
  9665.  
  9666. return ( $image ) ;
  9667. }
  9668.  
  9669.  
  9670. // __decode_gray8 -
  9671. // Decodes image data consisting of 8-bits gray values.
  9672. private function __decode_gray8 ( $data )
  9673. {
  9674. $data_length = strlen ( $data ) ;
  9675. $colors = array ( ) ;
  9676. $width = $this -> Width ;
  9677. $height = $this -> Height ;
  9678. $image = imagecreatetruecolor ( $width, $height ) ;
  9679.  
  9680. for ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i < $data_length ; $i ++, $pixel_x ++ )
  9681. {
  9682. $color = ord ( $data [$i] ) ;
  9683.  
  9684. if ( isset ( $colors [ $color ] ) )
  9685. $pixel_color = $colors [ $color ] ;
  9686. else
  9687. {
  9688. $pixel_color = imagecolorallocate ( $image, $color, $color, $color ) ;
  9689. $colors [ $color ] = $pixel_color ;
  9690. }
  9691.  
  9692. if ( $pixel_x >= $width )
  9693. {
  9694. $pixel_x = 0 ;
  9695. $pixel_y ++ ;
  9696. }
  9697.  
  9698. imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ;
  9699. }
  9700.  
  9701. return ( $image ) ;
  9702. }
  9703.  
  9704.  
  9705. /*--------------------------------------------------------------------------------------------------------------
  9706.  
  9707. Support functions.
  9708.  
  9709. *-------------------------------------------------------------------------------------------------------------*/
  9710.  
  9711. // __convert_cmyk_to_rgb -
  9712. // Converts CMYK color value to RGB.
  9713. private function __convert_cmyk_to_rgb ( $C, $M, $Y, $K )
  9714. {
  9715. if ( $C > 1 || $M > 1 || $Y > 1 || $K > 1 )
  9716. {
  9717. $C /= 100.0 ;
  9718. $M /= 100.0 ;
  9719. $Y /= 100.0 ;
  9720. $K /= 100.0 ;
  9721. }
  9722.  
  9723. $R = ( 1 - $C * ( 1 - $K ) - $K ) * 256 ;
  9724. $G = ( 1 - $M * ( 1 - $K ) - $K ) * 256 ;
  9725. $B = ( 1 - $Y * ( 1 - $K ) - $K ) * 256 ;
  9726.  
  9727. $result = array ( round ( $R ), round ( $G ), round ( $B ) ) ;
  9728.  
  9729. return ( $result ) ;
  9730. }
  9731. }
  9732.  
  9733.  
  9734. /*==============================================================================================================
  9735.  
  9736. class PdfFaxImage -
  9737. Handles encoded CCITT Fax images.
  9738.  
  9739. ==============================================================================================================*/
  9740. class PdfFaxImage extends PdfImage
  9741. {
  9742. public function __construct ( $image_data )
  9743. {
  9744. parent::__construct ( $image_data ) ;
  9745. }
  9746.  
  9747.  
  9748. protected function CreateImageResource ( $image_data )
  9749. {
  9750. warning ( new PdfToTextDecodingException ( "Decoding of CCITT Fax image format is not yet implemented." ) ) ;
  9751. //return ( imagecreatefromstring ( $image_data ) ) ;
  9752. }
  9753. }
  9754.  
  9755.  
  9756. /**************************************************************************************************************
  9757. **************************************************************************************************************
  9758. **************************************************************************************************************
  9759. ****** ******
  9760. ****** ******
  9761. ****** ENCRYPTION MANAGEMENT ******
  9762. ****** ******
  9763. ****** ******
  9764. **************************************************************************************************************
  9765. **************************************************************************************************************
  9766. **************************************************************************************************************/
  9767.  
  9768. /*==============================================================================================================
  9769.  
  9770. class PdfEncryptionData -
  9771. Holds encryption data and allows for decryption.
  9772.  
  9773. ==============================================================================================================*/
  9774. class PdfEncryptionData extends PdfObjectBase
  9775. {
  9776. // Encryption modes
  9777. const PDFMODE_UNKNOWN = 0 ;
  9778. const PDFMODE_STANDARD = 1 ;
  9779.  
  9780. // Encryption algorithms
  9781. const PDFCRYPT_ALGORITHM_RC4 = 0 ;
  9782. const PDFCRYPT_ALGORITHM_AES = 1 ;
  9783. const PDFCRYPT_ALGORITHM_AES256 = 2 ;
  9784.  
  9785. // A 32-bytes hardcoded padding used when computing encryption keys
  9786. const PDF_ENCRYPTION_PADDING = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A" ;
  9787.  
  9788. // Permission bits for encrypted files. Comments come from the PDF specification
  9789. const PDFPERM_PRINT = 0x0004 ; // bit 3 :
  9790. // (Revision 2) Print the document.
  9791. // (Revision 3 or greater) Print the document (possibly not at the highest quality level,
  9792. // depending on whether bit 12 is also set).
  9793. const PDFPERM_MODIFY = 0x0008 ; // bit 4 :
  9794. // Modify the contents of the document by operations other than those controlled by bits 6, 9, and 11.
  9795. const PDFPERM_COPY = 0x0010 ; // bit 5 :
  9796. // (Revision 2) Copy or otherwise extract text and graphics from the document, including extracting text
  9797. // and graphics (in support of accessibility to users with disabilities or for other purposes).
  9798. // (Revision 3 or greater) Copy or otherwise extract text and graphics from the document by operations
  9799. // other than that controlled by bit 10.
  9800. const PDFPERM_MODIFY_EXTRA = 0x0020 ; // bit 6 :
  9801. // Add or modify text annotations, fill in interactive form fields, and, if bit 4 is also set,
  9802. // create or modify interactive form fields (including signature fields).
  9803. const PDFPERM_FILL_FORM = 0x0100 ; // bit 9 :
  9804. // (Revision 3 or greater) Fill in existing interactive form fields (including signature fields),
  9805. // even if bit 6 is clear.
  9806. const PDFPERM_EXTRACT = 0x0200 ; // bit 10 :
  9807. // (Revision 3 or greater) Fill in existing interactive form fields (including signature fields),
  9808. // even if bit 6 is clear.
  9809. const PDFPERM_ASSEMBLE = 0x0400 ; // bit 11 :
  9810. // (Revision 3 or greater) Assemble the document (insert, rotate, or delete pages and create bookmarks
  9811. // or thumbnail images), even if bit 4 is clear.
  9812. const PDFPERM_HIGH_QUALITY_PRINT = 0x0800 ; // bit 12 :
  9813. // (Revision 3 or greater) Print the document to a representation from which a faithful digital copy of
  9814. // the PDF content could be generated. When this bit is clear (and bit 3 is set), printing is limited to
  9815. // a low-level representation of the appearance, possibly of degraded quality.
  9816.  
  9817. public $FileId ; // File ID, as specified by the /ID flag
  9818. public $ObjectId ; // Object id and text contents
  9819. private $ObjectData ;
  9820. public $Mode ; // Encryption mode - currently, only the "Standard" keyword is accepted
  9821. public $EncryptionAlgorithm ; // Encryption algorithm - one of the PDFCRYPT_* constants
  9822. public $AlgorithmVersion, // Encryption algorithm version & revision
  9823. $AlgorithmRevision ;
  9824. public $Flags ; // Protection flags, when an owner password has been specified - one of the PDFPERM_* constants
  9825. public $KeyLength ; // Encryption key length
  9826. public $UserKey, // User and owner password keys
  9827. $OwnerKey ;
  9828. public $UserEncryptionString, // Not sure yet of the real usage of these ones
  9829. $OwnerEncryptionString ;
  9830. public $EncryptMetadata ; // True if metadata is also encrypted
  9831. public $FileKeyLength ; // Key length / 5
  9832.  
  9833. protected $Decrypter ; // Decrypter object
  9834.  
  9835. private $UnsupportedEncryptionAlgorithm = false ; // True if the encryption algorithm used in the PDF file is not yet supported
  9836.  
  9837.  
  9838. /**************************************************************************************************************
  9839.  
  9840. NAME
  9841. Constructor
  9842.  
  9843. PROTOTYPE
  9844. obj = new PdfEncryptionData ( $mode, $object_id, $object_data ) ;
  9845.  
  9846. DESCRIPTION
  9847. Creates an instance of a PdfEncryptionData class, using the information parsed from the supplied object
  9848. data.
  9849.  
  9850. PARAMETERS
  9851. $mode (integer) -
  9852. One of the PDFMODE_* constants.
  9853.  
  9854. $object_id (integer) -
  9855. Id of the object containing enryption parameters.
  9856.  
  9857. $object_data (string) -
  9858. Encryption parameters.
  9859.  
  9860. AUTHOR
  9861. Christian Vigh, 03/2017.
  9862.  
  9863. HISTORY
  9864. [Version : 1.0] [Date : 2017-03-14] [Author : CV]
  9865. Initial version.
  9866.  
  9867. **************************************************************************************************************/
  9868. public function __construct ( $file_id, $mode, $object_id, $object_data )
  9869. {
  9870. $this -> FileId = $file_id ;
  9871. $this -> ObjectId = $object_id ;
  9872. $this -> ObjectData = $object_data ;
  9873. $this -> Mode = $mode ;
  9874.  
  9875. // Encryption algorithm version & revision
  9876. preg_match ( '#/V \s+ (?P<value> \d+)#ix', $object_data, $algorithm_match ) ;
  9877. $this -> AlgorithmVersion = ( integer ) $algorithm_match [ 'value' ] ;
  9878.  
  9879. preg_match ( '#/R \s+ (?P<value> \d+)#ix', $object_data, $algorithm_revision_match ) ;
  9880. $this -> AlgorithmRevision = ( integer ) $algorithm_revision_match [ 'value' ] ;
  9881.  
  9882. // Encryption flags
  9883. preg_match ( '#/P \s+ (?P<value> \-? \d+)#ix', $object_data, $flags_match ) ;
  9884. $this -> Flags = ( integer) $flags_match [ 'value' ] ;
  9885.  
  9886. // Key length (40 bits, if not specified)
  9887. if ( preg_match ( '#/Length \s+ (?P<value> \d+)#ix', $object_data, $key_length_match ) )
  9888. $this -> KeyLength = $key_length_match [ 'value' ] ;
  9889. else
  9890. $this -> KeyLength = 40 ;
  9891.  
  9892. // Owner and user passwords
  9893. $this -> UserKey = $this -> GetStringParameter ( '/U', $object_data ) ;
  9894. $this -> OwnerKey = $this -> GetStringParameter ( '/O', $object_data ) ;
  9895.  
  9896. // Owner and user encryption strings
  9897. $this -> UserEncryptionString = $this -> GetStringParameter ( '/UE', $object_data ) ;
  9898. $this -> OwnerEncryptionString = $this -> GetStringParameter ( '/OE', $object_data ) ;
  9899.  
  9900. // EncryptMetadata flag
  9901. if ( preg_match ( '# /EncryptMetadata (?P<value> (true) | (1) | (false) | (0) )#imsx', $object_data, $encryption_match ) )
  9902. {
  9903. if ( ! strcasecmp ( $encryption_match [ 'value' ], 'true' ) || ! strcasecmp ( $encryption_match [ 'value' ], 'false' ) )
  9904. $this -> EncryptMetadata = true ;
  9905. else
  9906. $this -> EncryptMetadata = false ;
  9907. }
  9908. else
  9909. $this -> EncryptMetadata = false ;
  9910.  
  9911. // Now, try to determine the encryption algorithm to be used
  9912. $user_key_length = strlen ( $this -> UserKey ) ;
  9913. $owner_key_length = strlen ( $this -> OwnerKey ) ;
  9914. $user_encryption_string_length = strlen ( $this -> UserEncryptionString ) ;
  9915. $owner_encryption_string_length = strlen ( $this -> OwnerEncryptionString ) ;
  9916.  
  9917. $error_unhandled_version = false ;
  9918. $error_unhandled_revision = false ;
  9919.  
  9920. switch ( $this -> AlgorithmVersion )
  9921. {
  9922. case 1 :
  9923. switch ( $this -> AlgorithmRevision )
  9924. {
  9925. case 2 :
  9926. if ( $user_key_length != 32 && $owner_key_length != 32 )
  9927. {
  9928. if ( PdfToText::$DEBUG )
  9929. error ( new PdfToTextDecryptionException ( "Invalid user and/or owner key length ($user_key_length/$owner_key_length)", $object_id ) ) ;
  9930. }
  9931.  
  9932. $this -> EncryptionAlgorithm = self::PDFCRYPT_ALGORITHM_RC4 ;
  9933. $this -> FileKeyLength = 5 ;
  9934. break ;
  9935.  
  9936. default :
  9937. $error_unhandled_revision = true ;
  9938. }
  9939. break ;
  9940.  
  9941. default :
  9942. $error_unhandled_version = true ;
  9943. }
  9944.  
  9945. // Report unsupported versions/revisions
  9946. if ( $error_unhandled_version || $error_unhandled_revision )
  9947. {
  9948. if ( PdfToText::$DEBUG )
  9949. error ( new PdfToTextDecryptionException ( "Unsupported encryption algorithm version {$this -> AlgorithmVersion} revision {$this -> AlgorithmRevision}.",
  9950. $object_id ) ) ;
  9951.  
  9952. $this -> UnSupportedEncryptionAlgorithm = true ;
  9953.  
  9954. return ;
  9955. }
  9956.  
  9957. // Build the object key
  9958. $this -> Decrypter = PdfDecryptionAlgorithm::GetInstance ( $this ) ;
  9959.  
  9960. if ( $this -> Decrypter === false )
  9961. {
  9962. if ( PdfToText::$DEBUG )
  9963. warning ( new PdfToTextDecryptionException ( "Unsupported encryption algorithm #{$this -> EncryptionAlgorithm}, " .
  9964. "version {$this -> AlgorithmVersion} revision {$this -> AlgorithmRevision}.",
  9965. $object_id ) ) ;
  9966.  
  9967. $this -> UnsupportedEncryptionAlgorithm = true ;
  9968.  
  9969. return ;
  9970. }
  9971. //dump ( $this ) ;
  9972. }
  9973.  
  9974.  
  9975. /*--------------------------------------------------------------------------------------------------------------
  9976.  
  9977. NAME
  9978. GetInstance - Creates an instance of a PdfEncryptionData object.
  9979.  
  9980. PROTOTYPE
  9981. $obj = PdfEncryptionData::GetInstance ( $object_id, $object_data ) ;
  9982.  
  9983. DESCRIPTION
  9984. Returns an instance of encryption data
  9985.  
  9986. *-------------------------------------------------------------------------------------------------------------*/
  9987. public static function GetInstance ( $file_id, $object_id, $object_data )
  9988. {
  9989. // Encryption mode
  9990. if ( ! preg_match ( '#/Filter \s* / (?P<mode> \w+)#ix', $object_data, $object_data_match ) )
  9991. return (false ) ;
  9992.  
  9993. switch ( strtolower ( $object_data_match [ 'mode' ] ) )
  9994. {
  9995. case 'standard' :
  9996. $mode = self::PDFMODE_STANDARD ;
  9997. break ;
  9998.  
  9999. default :
  10000. if ( self::$DEBUG > 1 )
  10001. error ( new PdfToTextDecodingException ( "Unhandled encryption mode '{$object_data [ 'mode' ]}'", $object_id ) ) ;
  10002.  
  10003. return ( false ) ;
  10004.  
  10005. }
  10006.  
  10007. // Basic checks have been performed, return an instance of encryption data
  10008. return ( new PdfEncryptionData ( $file_id, $mode, $object_id, $object_data ) ) ;
  10009. }
  10010.  
  10011.  
  10012. /*--------------------------------------------------------------------------------------------------------------
  10013.  
  10014. NAME
  10015. Decrypt - Decrypts object data.
  10016.  
  10017. PROTOTYPE
  10018. $data = $this -> Decrypt ( $object_id, $object_data ) ;
  10019.  
  10020. DESCRIPTION
  10021. Decrypts object data, when the PDF file is password-protected.
  10022.  
  10023. PARAMETERS
  10024. $object_id (integer) -
  10025. Pdf object number.
  10026.  
  10027. $object_data (string) -
  10028. Object data.
  10029.  
  10030. RETURN VALUE
  10031. Returns the decrypted object data, or false if the encrypted object could not be decrypted.
  10032.  
  10033. *-------------------------------------------------------------------------------------------------------------*/
  10034. public function Decrypt ( $object_id, $object_data )
  10035. {
  10036. if ( $this -> UnsupportedEncryptionAlgorithm )
  10037. return ( false ) ;
  10038.  
  10039. return ( false ) ;
  10040. //return ( $this -> Decrypter -> Decrypt ( $object_data ) ) ;
  10041. //return ( "BT (coucou)Tj ET" ) ;
  10042. }
  10043. }
  10044.  
  10045.  
  10046. /*==============================================================================================================
  10047.  
  10048. class PdfDecryptionAlgorithm -
  10049. Base class for algorithm decrypters.
  10050.  
  10051. ==============================================================================================================*/
  10052. abstract class PdfDecryptionAlgorithm //extends Object
  10053. {
  10054. protected $EncryptionData ;
  10055. protected $ObjectKey ;
  10056. protected $ObjectKeyBytes ;
  10057. protected $ObjectKeyLength ;
  10058.  
  10059.  
  10060. public function __construct ( $encryption_data )
  10061. {
  10062. $this -> EncryptionData = $encryption_data ;
  10063.  
  10064. $objkey = '' ;
  10065.  
  10066. for ( $i = 0 ; $i < $this -> EncryptionData -> FileKeyLength ; $i ++ )
  10067. $objkey .= $this -> EncryptionData -> FileId [$i] ;
  10068.  
  10069. $objkey .= chr ( ( $this -> EncryptionData -> ObjectId ) & 0xFF ) ;
  10070. $objkey .= chr ( ( $this -> EncryptionData -> ObjectId >> 8 ) & 0xFF ) ;
  10071. $objkey .= chr ( ( $this -> EncryptionData -> ObjectId >> 16 ) & 0xFF ) ;
  10072. $objkey .= chr ( 0 ) ; // obj generation number & 0xFF
  10073. $objkey .= chr ( 0 ) ; // obj generation number >> 8 & 0xFF
  10074.  
  10075. $md5 = md5 ( $objkey, true ) ;
  10076. $this -> ObjectKey = $md5 ;
  10077. $this -> ObjectKeyLength = 16 ;
  10078.  
  10079. $this -> ObjectKeyBytes = array ( ) ;
  10080.  
  10081. for ( $i = 0 ; $i < $this -> ObjectKeyLength ; $i ++ )
  10082. $this -> ObjectKeyBytes [] = ord ( $this -> ObjectKey [$i] ) ;
  10083. }
  10084.  
  10085.  
  10086. public static function GetInstance ( $encryption_data )
  10087. {
  10088. switch ( $encryption_data -> EncryptionAlgorithm )
  10089. {
  10090. case PdfEncryptionData::PDFCRYPT_ALGORITHM_RC4 :
  10091. return ( new PdfRC4DecryptionAlgorithm ( $encryption_data ) ) ;
  10092.  
  10093. default :
  10094. return ( false ) ;
  10095. }
  10096. }
  10097.  
  10098.  
  10099. abstract public function Reset ( ) ;
  10100. abstract public function Decrypt ( $data ) ;
  10101.  
  10102. }
  10103.  
  10104.  
  10105. /*==============================================================================================================
  10106.  
  10107. class PdfRC4DecryptionAlgorithm -
  10108. A decrypter class for RC4 encoding.
  10109.  
  10110. ==============================================================================================================*/
  10111. class PdfRC4DecryptionAlgorithm extends PdfDecryptionAlgorithm
  10112. {
  10113. private static $InitialState = false ;
  10114. protected $State ;
  10115.  
  10116.  
  10117. public function __construct ( $encryption_data )
  10118. {
  10119. parent::__construct ( $encryption_data ) ;
  10120.  
  10121. if ( self::$InitialState === false )
  10122. self::$InitialState = range ( 0, 255 ) ;
  10123. }
  10124.  
  10125.  
  10126. public function Reset ( )
  10127. {
  10128. $this -> State = self::$InitialState ;
  10129. $index1 =
  10130. $index2 = 0 ;
  10131.  
  10132. for ( $i = 0 ; $i < 256 ; $i ++ )
  10133. {
  10134. $index2 = ( $this -> ObjectKeyBytes [ $index1 ] + $this -> State [$i] + $index2 ) & 0xFF ;
  10135.  
  10136. // Swap elements $index2 and $i from $State
  10137. $x = $this -> State [$i] ;
  10138. $this -> State [$i] = $this -> State [ $index2 ] ;
  10139. $this -> State [ $index2 ] = $x ;
  10140.  
  10141. $index1 = ( $index1 + 1 ) % $this -> ObjectKeyLength ;
  10142. }
  10143. }
  10144.  
  10145.  
  10146. public function Decrypt ( $data )
  10147. {
  10148. $this -> Reset ( ) ;
  10149. $length = strlen ( $data ) ;
  10150. $x = 0 ;
  10151. $y = 0 ;
  10152. $result = '' ;
  10153.  
  10154. for ( $i = 0 ; $i < $length ; $i ++ )
  10155. {
  10156. $ord = ord ( $data [$i] ) ;
  10157. $x = ( $x + 1 ) & 0xFF ;
  10158. $y = ( $this -> State [$x] + $y ) & 0xFF ;
  10159.  
  10160. $tx = $this -> State [$x] ;
  10161. $ty = $this -> State [$y] ;
  10162.  
  10163. $this -> State [$x] = $ty ;
  10164. $this -> State [$y] = $tx ;
  10165.  
  10166. $new_ord = $ord ^ $this -> State [ ( $tx + $ty ) & 0xFF ] ;
  10167. $result .= chr ( $new_ord ) ;
  10168. }
  10169.  
  10170. return ( $result ) ;
  10171. }
  10172. }
  10173.  
  10174. /*
  10175. static Guchar rc4DecryptByte(Guchar *state, Guchar *x, Guchar *y, Guchar c) {
  10176. Guchar x1, y1, tx, ty;
  10177.  
  10178. x1 = *x = (*x + 1) % 256;
  10179. y1 = *y = (state[*x] + *y) % 256;
  10180. tx = state[x1];
  10181. ty = state[y1];
  10182. state[x1] = ty;
  10183. state[y1] = tx;
  10184. return c ^ state[(tx + ty) % 256];
  10185. }
  10186. */
  10187.  
  10188.  
  10189. /**************************************************************************************************************
  10190. **************************************************************************************************************
  10191. **************************************************************************************************************
  10192. ****** ******
  10193. ****** ******
  10194. ****** FORM DATA MANAGEMENT ******
  10195. ****** ******
  10196. ****** ******
  10197. **************************************************************************************************************
  10198. **************************************************************************************************************
  10199. **************************************************************************************************************/
  10200.  
  10201.  
  10202. /*==============================================================================================================
  10203.  
  10204. class PdfToTextFormDefinitions -
  10205. Analyzes a template XML file that describes PDF form data and maps PDF field names to human-readable
  10206. names.
  10207. The GetFormData() method returns an object containing the mapped properties with their respective values.
  10208.  
  10209. ==============================================================================================================*/
  10210. class PdftoTextFormDefinitions // extends Object
  10211. implements ArrayAccess, Countable, IteratorAggregate
  10212. {
  10213. static private $ClassDefinitionCount = 0 ;
  10214.  
  10215. // Class name, as specified in the XML template
  10216. protected $ClassName ;
  10217. // Form definitions (a template may contain several versions of the same form definition)
  10218. protected $Definitions ;
  10219. // Form definitions coming from the PDF file
  10220. protected $PdfDefinitions ;
  10221.  
  10222.  
  10223. /*--------------------------------------------------------------------------------------------------------------
  10224.  
  10225. Constructor -
  10226. Parses the supplied XML template.
  10227.  
  10228. *-------------------------------------------------------------------------------------------------------------*/
  10229. public function __construct ( $xml_data, $pdf_xml_data )
  10230. {
  10231. // Get PDF XML form data definitions
  10232. $this -> __get_pdf_form_definitions ( $pdf_xml_data ) ;
  10233.  
  10234. // Create XML data from scratch, if none specified
  10235. if ( ! $xml_data )
  10236. $xml_data = $this -> __create_default_xml_data ( $this -> PdfDefinitions ) ;
  10237.  
  10238. // Decode XML the hard way, without XSD
  10239. $xml = simplexml_load_string ( $xml_data ) ;
  10240. $root_entry = $xml -> getName ( ) ;
  10241. $definitions = array ( ) ;
  10242. $class_name = "PdfFormData" ;
  10243.  
  10244. if ( strcasecmp ( $root_entry, "forms" ) )
  10245. error ( new PdfToTextFormException ( "Root entry must be <forms>, <$root_entry> was found." ) ) ;
  10246.  
  10247. // Get the attribute values of the <forms> tag
  10248. foreach ( $xml -> attributes ( ) as $attribute_name => $attribute_value )
  10249. {
  10250. switch ( strtolower ( $attribute_name ) )
  10251. {
  10252. case 'class' :
  10253. $class_name = ( string ) $attribute_value ;
  10254.  
  10255. if ( class_exists ( $class_name, false ) )
  10256. error ( new PdfToTextFormException ( "Class \"$class_name\" specified in XML template already exists." ) ) ;
  10257.  
  10258. break ;
  10259.  
  10260. default :
  10261. error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <forms> tag." ) ) ;
  10262. }
  10263. }
  10264.  
  10265. // Don't know if it will be useful, but try to avoid class name collisions by appending a sequential number if necessary
  10266. if ( class_exists ( $class_name, false ) )
  10267. {
  10268. self::$ClassDefinitionCount ++ ;
  10269. $class_name .= '_' . self::$ClassDefinitionCount ;
  10270. }
  10271.  
  10272. // Loop through each child <form> entry
  10273. foreach ( $xml -> children ( ) as $child )
  10274. {
  10275. $child_name = $child -> getName ( ) ;
  10276.  
  10277. switch ( strtolower ( $child_name ) )
  10278. {
  10279. case 'form' :
  10280. $definitions [] = new PdfToTextFormDefinition ( $class_name, $child, $this -> PdfDefinitions ) ;
  10281. break ;
  10282.  
  10283. default :
  10284. error ( new PdfToTextFormException ( "Invalid tag <$child_name>." ) ) ;
  10285. }
  10286. }
  10287.  
  10288. // Ensure that there is at least one form definition
  10289. if ( ! count ( $definitions ) )
  10290. error ( new PdfToTextFormException ( "No <form> definition found." ) ) ;
  10291.  
  10292. // Save to properties
  10293. $this -> ClassName = $class_name ;
  10294. $this -> Definitions = $definitions ;
  10295. }
  10296.  
  10297.  
  10298.  
  10299.  
  10300. /*--------------------------------------------------------------------------------------------------------------
  10301.  
  10302. Internal methods.
  10303.  
  10304. *-------------------------------------------------------------------------------------------------------------*/
  10305.  
  10306. // __get_pdf_form_definitions -
  10307. // Retrieves the form field definitions coming from the PDF file.
  private function __get_pdf_form_definitions ( $pdf_data )
  {
      // Isolate every <field ...>...</field> fragment of the supplied data.
      // The "x" flag makes the blanks written inside the pattern insignificant.
      preg_match_all ( '#(?P<field> <field .*? </field \s* >)#imsx', $pdf_data, $matches ) ;

      foreach ( $matches [ 'field' ] as $field )
      {
          $xml_field = simplexml_load_string ( $field ) ;

          // simplexml_load_string() returns false when the fragment is not well-formed XML
          // (for example when the non-greedy match cut through nested <field> tags) :
          // skip such a fragment instead of dying on a method call on "false".
          if ( $xml_field === false )
              continue ;

          foreach ( $xml_field -> attributes ( ) as $attribute_name => $attribute_value )
          {
              // Only the "name" attribute is of interest here
              if ( strtolower ( $attribute_name ) !== 'name' )
                  continue ;

              $field_name = ( string ) $attribute_value ;

              // Count how many times each field name has been met
              if ( isset ( $this -> PdfDefinitions [ $field_name ] ) )
                  $this -> PdfDefinitions [ $field_name ] [ 'occurrences' ] ++ ;
              else
              {
                  $this -> PdfDefinitions [ $field_name ] = array
                  (
                      'name' => $field_name,
                      'occurrences' => 1
                  ) ;
              }
          }
      }
  }
  10339.  
  10340.  
  10341. // __create_default_xml_data -
  10342. // When no XML template has been specified, creates a default one based on the form definitions located in the PDF file.
  // Builds a default XML template with one <field type="string"> entry per key of $pdf_definitions.
  //   $pdf_definitions (array) : array whose keys are the PDF form field names (the values are not used).
  //   Returns the XML template, as a string.
  private function __create_default_xml_data ( $pdf_definitions )
  {
      // Characters that cannot appear verbatim inside a double-quoted XML attribute value
      $xml_search = array ( '&', '<', '>', '"' ) ;
      $xml_replace = array ( '&amp;', '&lt;', '&gt;', '&quot;' ) ;

      $result = "<forms>" . PHP_EOL .
                "\t<form version=\"1.0\">" . PHP_EOL ;

      foreach ( array_keys ( $pdf_definitions ) as $pdf_name )
      {
          // The "name" attribute becomes a PHP property name, so dashes are turned into underscores.
          // The "form-field" attribute must keep the PDF field name untouched : it is later looked up
          // as a key of the PDF definitions, and a rewritten name would no longer be found there.
          $property_name = str_replace ( '-', '_', $pdf_name ) ;

          $property_name = str_replace ( $xml_search, $xml_replace, $property_name ) ;
          $form_field = str_replace ( $xml_search, $xml_replace, $pdf_name ) ;

          $result .= "\t\t<field name=\"$property_name\" form-field=\"$form_field\" type=\"string\"/>" . PHP_EOL ;
      }

      $result .= "\t</form>" . PHP_EOL .
                 "</forms>" . PHP_EOL ;

      return ( $result ) ;
  }
  10359.  
  10360.  
  10361. /*--------------------------------------------------------------------------------------------------------------
  10362.  
  10363. Interfaces implementations to retrieve form definitions.
  10364.  
  10365. *-------------------------------------------------------------------------------------------------------------*/
  // Returns the number of form definitions held by this object.
  public function count ( )
  {
      // The original expression read "$this - Definitions" (a subtraction involving an undefined
      // constant) instead of the property access "$this -> Definitions".
      return ( count ( $this -> Definitions ) ) ;
  }
  10368.  
  10369.  
  // Lets foreach() walk through the form definitions.
  public function getIterator ( )
  {
      $iterator = new ArrayIterator ( $this -> Definitions ) ;

      return ( $iterator ) ;
  }
  10372.  
  10373.  
  // Tells whether a form definition exists at the specified offset.
  public function offsetExists ( $offset )
  {
      // A plain numeric range test is not enough : with loose comparisons, a non-numeric string such
      // as "abc" was reported as existing on PHP 7, although offsetGet() cannot return anything for it.
      // Looking the key up directly gives the right answer for every scalar offset ; non-scalar offsets
      // can never be valid array keys.
      if ( ! is_scalar ( $offset ) )
          return ( false ) ;

      return ( isset ( $this -> Definitions [ $offset ] ) ) ;
  }
  10376.  
  10377.  
  // Returns the form definition stored at the specified offset.
  public function offsetGet ( $offset )
  {
      $definition = $this -> Definitions [ $offset ] ;

      return ( $definition ) ;
  }
  10380.  
  10381.  
  // Form definitions cannot be modified through array access : any write attempt is reported as an error.
  public function offsetSet ( $offset, $value )
  {
      $exception = new PdfToTextException ( "Unsupported operation." ) ;

      error ( $exception ) ;
  }
  10384.  
  10385.  
  // Form definitions cannot be removed through array access : any unset attempt is reported as an error.
  public function offsetunset ( $offset )
  {
      $exception = new PdfToTextException ( "Unsupported operation." ) ;

      error ( $exception ) ;
  }
  10388. }
  10389.  
  10390.  
  10391. /*==============================================================================================================
  10392.  
  10393. class PdfToTextFormDefinition -
  10394. Holds the description of a form inside a form XML template.
  10395.  
  10396. ==============================================================================================================*/
  class PdfToTextFormDefinition // extends Object
  {
      // Class of the object returned by GetFormData( )
      public $ClassName ;

      // Form version
      public $Version ;

      // Field definitions
      public $FieldDefinitions = array ( ) ;

      // Field groups (ie, fields that are the results of the concatenation of several form fields)
      public $Groups = array ( ) ;

      // Pdf field definitions
      public $PdfDefinitions ;

      // Class definition in PHP, whose instance will be returned by GetFormData()
      private $ClassDefinition = false ;

      // Direct access to field definitions either through their template name or PDF name
      // (both arrays map a name to an index in the $FieldDefinitions array)
      private $FieldDefinitionsByName = array ( ) ;
      private $FieldDefinitionsByPdfName = array ( ) ;


      /*--------------------------------------------------------------------------------------------------------------

          Constructor -
              Analyze the contents of an XML template form definition.

          PARAMETERS
              $class_name (string) -
                  Name of the class that GetFormData() will instantiate.

              $form_definition (SimpleXMLElement) -
                  The <form> node of the XML template.

              $pdf_definitions (array) -
                  Field definitions coming from the PDF file, indexed by PDF field name.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $class_name, $form_definition, $pdf_definitions )
      {
          $this -> ClassName = $class_name ;
          $this -> PdfDefinitions = $pdf_definitions ;
          $field_count = 0 ;

          // Get <form> tag attributes
          foreach ( $form_definition -> attributes ( ) as $attribute_name => $attribute_value )
          {
              switch ( strtolower ( $attribute_name ) )
              {
                  case 'version' :
                      $this -> Version = ( string ) $attribute_value ;
                      break ;

                  default :
                      error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <form> tag." ) ) ;
              }
          }

          // Loop through subtags
          foreach ( $form_definition -> children ( ) as $child )
          {
              $tag_name = $child -> getName ( ) ;

              switch ( strtolower ( $tag_name ) )
              {
                  // <group> :
                  //  A group is used to create a property that is the concatenation of several existing properties.
                  case 'group' :
                      $this -> Groups [] = $this -> __parse_group ( $child ) ;
                      break ;

                  // <field> :
                  //  Field definition. Its position in $FieldDefinitions is recorded under both of its names.
                  case 'field' :
                      $field_def = new PdfToTextFormFieldDefinition ( $child ) ;
                      $this -> FieldDefinitions [] = $field_def ;
                      $this -> FieldDefinitionsByName [ $field_def -> Name ] = $field_count ;
                      $this -> FieldDefinitionsByPdfName [ $field_def -> PdfName ] = $field_count ;
                      $field_count ++ ;
                      break ;

                  // Don't allow other tag names
                  default :
                      error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <form> definition." ) ) ;
              }
          }

          // Check that everything is ok (ie, that there is no duplicate fields)
          $this -> __paranoid_checks ( ) ;
      }


      // __parse_group -
      //  Converts a <group> node into an array having the 'name', 'separator' and 'fields' entries.
      private function __parse_group ( $group_node )
      {
          $fields = array ( ) ;
          $separator = '' ;
          $name = false ;

          // Attribute names are compared case-insensitively, like for every other tag of the template
          foreach ( $group_node -> attributes ( ) as $attribute_name => $attribute_value )
          {
              switch ( strtolower ( $attribute_name ) )
              {
                  // "name" attribute :
                  //  The name of the property, as it will appear in the output object.
                  case 'name' :
                      $name = self::ValidatePhpName ( ( string ) $attribute_value ) ;
                      break ;

                  // "separator" attribute :
                  //  Separator to be used when concatenating the underlying properties.
                  case 'separator' :
                      $separator = ( string ) $attribute_value ;
                      break ;

                  // "fields" attribute :
                  //  A list of comma-separated field names, whose values will be concatenated together
                  //  using the specified separator. explode() always returns at least one item, so an
                  //  empty attribute is caught by ValidatePhpName(), which rejects the empty name.
                  case 'fields' :
                      foreach ( explode ( ',', ( string ) $attribute_value ) as $item )
                          $fields [] = self::ValidatePhpName ( $item ) ;

                      break ;

                  // Other attribute names : not allowed
                  default :
                      error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <group> tag." ) ) ;
              }
          }

          // Check that at least one field has been specified
          if ( ! count ( $fields ) )
              error ( new PdfToTextFormException ( "Empty \"fields\" attribute in <group> tag." ) ) ;

          // Check that the mandatory property name has been specified
          if ( ! $name )
              error ( new PdfToTextFormException ( "The \"name\" attribute is mandatory in <group> tag." ) ) ;

          return
          (
              array
              (
                  'name' => $name,
                  'separator' => $separator,
                  'fields' => $fields
              )
          ) ;
      }


      // ValidatePhpName -
      //  Returns the trimmed name if it is a valid PHP identifier, or reports an error otherwise.
      //  Declared static because every caller uses the PdfToTextFormDefinition::ValidatePhpName() form,
      //  which raises an Error on PHP 8 when the method is not static. Instance calls keep working.
      public static function ValidatePhpName ( $name )
      {
          $name = trim ( $name ) ;

          if ( ! preg_match ( '/^ [a-z_][a-z0-9_]* $/ix', $name ) )
              error ( new PdfToTextFormException ( "Invalid PHP name \"$name\"." ) ) ;

          return ( $name ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              GetClassDefinition - Returns the class definition for the current form.

          PROTOTYPE
              $def = $form_def -> GetClassDefinition ( ) ;

          DESCRIPTION
              Returns a string containing the PHP class definition that will contain the properties defined in the XML
              form template. The result is computed once, then cached.

          RETURN VALUE
              Returns a string containing the PHP class definition for the current form.

       *-------------------------------------------------------------------------------------------------------------*/
      public function GetClassDefinition ( )
      {
          // Return the existing definition, if this method has been called more than once
          if ( $this -> ClassDefinition )
              return ( $this -> ClassDefinition ) ;

          // The version is written on a "//" comment line : a line break or a PHP closing tag in it would
          // terminate the comment and corrupt the generated code
          $version = str_replace ( array ( "\r", "\n", '?>' ), array ( ' ', ' ', '? >' ), ( string ) $this -> Version ) ;

          $class_def = "// Class " . $this -> ClassName . " : " . $version . PHP_EOL .
                       "class " . $this -> ClassName . "\t\textends PdfToTextFormData" . PHP_EOL .
                       " {" . PHP_EOL ;

          // Get the maximum width of constant and field names
          $max_width = 0 ;

          foreach ( $this -> FieldDefinitions as $def )
          {
              $max_width = max ( $max_width, strlen ( $def -> Name ), strlen ( $def -> PdfName ) ) ;

              foreach ( $def -> Constants as $constant )
                  $max_width = max ( $max_width, strlen ( $constant [ 'name' ] ) ) ;
          }

          // First, write out the constant definitions
          $all_constants = array ( ) ;

          foreach ( $this -> FieldDefinitions as $def )
          {
              foreach ( $def -> Constants as $constant )
              {
                  $name = $constant [ 'name' ] ;
                  $value = $constant [ 'value' ] ;

                  // A constant shared by several fields is written only once
                  if ( isset ( $all_constants [ $name ] ) )
                  {
                      if ( $all_constants [ $name ] != $value )
                          error ( new PdfToTextFormException ( "Constant \"$name\" is defined more than once with different values." ) ) ;

                      continue ;
                  }

                  $all_constants [ $name ] = $value ;
                  $literal = $this -> __get_constant_literal ( $value ) ;

                  $class_def .= "\tconst\t" . str_pad ( $name, $max_width, " ", STR_PAD_RIGHT ) . "\t = $literal ; " . PHP_EOL ;
              }
          }

          $class_def .= PHP_EOL . PHP_EOL ;

          // Then write property definitions
          foreach ( $this -> FieldDefinitions as $def )
          {
              $class_def .= "\t/** @formdata */" . PHP_EOL .
                            "\tprotected\t\t" . '$' . $def -> Name . " ;" . PHP_EOL ;
          }

          $class_def .= PHP_EOL . PHP_EOL ;

          // And finally, grouped properties.
          // NOTE(review): only ")" is escaped in the separator ; a separator containing "*/" would still
          // close the doc comment early - confirm whether templates can contain such separators.
          foreach ( $this -> Groups as $group )
          {
              $class_def .= "\t/**" . PHP_EOL .
                            "\t\t@formdata" . PHP_EOL .
                            "\t\t@group(" . implode ( ',', $group [ 'fields' ] ) . ')' . PHP_EOL .
                            "\t\t@separator(" . str_replace ( ')', '\)', $group [ 'separator' ] ) . ')' . PHP_EOL .
                            "\t */" . PHP_EOL .
                            "\tprotected\t\t" . '$' . $group [ 'name' ] . " ;" . PHP_EOL . PHP_EOL ;
          }

          // Constructor
          $class_def .= PHP_EOL . PHP_EOL .
                        "\t// Class constructor" . PHP_EOL .
                        "\tpublic function __construct ( )" . PHP_EOL .
                        "\t {" . PHP_EOL .
                        "\t\tparent::__construct ( ) ;" . PHP_EOL .
                        "\t }" . PHP_EOL ;

          $class_def .= " }" . PHP_EOL ;

          // Save the definition, if a second call occurs
          $this -> ClassDefinition = $class_def ;

          // All done, return
          return ( $class_def ) ;
      }


      // __get_constant_literal -
      //  Returns the PHP source code literal to be used for a constant value.
      private function __get_constant_literal ( $value )
      {
          // Numeric values are written as is, except when they start with a zero followed by a digit :
          // PHP would read "010" as octal 8, and would reject "08" with a parse error.
          if ( is_numeric ( $value ) && ! preg_match ( '/^\s*[+-]?0\d/', $value ) )
              return ( $value ) ;

          // Everything else is written as a single-quoted string, so that a "$" in the value is not
          // taken for a variable reference, as it was inside the previous double-quoted form.
          return ( var_export ( ( string ) $value, true ) ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              GetFormData - Returns a form data object containing properties mapped to the form data.

          PROTOTYPE
              $object = $form_def -> GetFormData ( $fields ) ;

          DESCRIPTION
              Returns an object containing properties mapped to actual form data.

          PARAMETERS
              $fields (array) -
                  An associative array whose keys are the PDF form field names, and values their values as stored
                  in the PDF file. Entries whose key is not a known PDF field name are ignored.

          RETURN VALUE
              Returns an object of the class, as defined by the template specified to PdfToTextFormDefinitions
              class constructor.

       *-------------------------------------------------------------------------------------------------------------*/
      public function GetFormData ( $fields = array ( ) )
      {
          // The class is created on the fly, the first time it is needed.
          // NOTE(review): this evaluates code generated from the XML template ; templates should only
          // come from a trusted source.
          if ( ! class_exists ( $this -> ClassName, false ) )
          {
              $class_def = $this -> GetClassDefinition ( ) ;
              eval ( $class_def ) ;
          }

          $class_name = $this -> ClassName ;
          $object = new $class_name ( ) ;

          foreach ( $fields as $name => $value )
          {
              if ( ! isset ( $this -> FieldDefinitionsByPdfName [ $name ] ) )
                  continue ;

              // Translate the PDF field name to the property name declared in the template
              $index = $this -> FieldDefinitionsByPdfName [ $name ] ;
              $property = $this -> FieldDefinitions [ $index ] -> Name ;

              $object -> $property = $this -> __process_field_value ( $value ) ;
          }

          return ( $object ) ;
      }


      // __process_field_value -
      //  Translates html entities and replaces carriage returns (which are apparently used for multiline
      //  fields) with newlines : both "\r\n" and a lone "\r" become a single "\n".
      private function __process_field_value ( $value )
      {
          $value = html_entity_decode ( $value ) ;

          // "\r\n" must be handled first, so that its "\r" is not converted on its own
          return ( str_replace ( array ( "\r\n", "\r" ), "\n", $value ) ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              GetFormDataFromPdfObject - Same as GetFormData(), except that it operates on XML data.

          PROTOTYPE
              $object = $pdf -> GetFormDataFromPdfObject ( $pdf_data ) ;

          DESCRIPTION
              Behaves the same as GetFormData(), except that it takes as input the XML contents of a PDF object.

          PARAMETERS
              $pdf_data (string) -
                  XML data coming from the PDF file.

          RETURN VALUE
              Returns an object of the class, as defined by the template specified to PdfToTextFormDefinitions
              class constructor.

       *-------------------------------------------------------------------------------------------------------------*/
      public function GetFormDataFromPdfObject ( $pdf_data )
      {
          // simplexml_ functions do not like tags that contain a colon - replace them with a dash.
          // NOTE(review): the pattern is not restricted to tag names ; it replaces the first colon found
          // after any "<", even when that colon belongs to the text of a field value (eg, "12:30").
          $pdf_data = preg_replace ( '/(<[^:]+?)(:)/', '$1-', $pdf_data ) ;

          // Load the xml data
          $xml = simplexml_load_string ( $pdf_data ) ;

          // Get the form field values ; simplexml_load_string() returns false on malformed data, which
          // must be reported rather than used as an object
          $fields = array ( ) ;

          if ( $xml === false )
              error ( new PdfToTextFormException ( "The PDF form data is not valid XML." ) ) ;
          else
              $this -> __get_pdfform_data ( $fields, $xml ) ;

          // Return the object
          return ( $this -> GetFormData ( $fields ) ) ;
      }


      // __get_pdfform_data -
      //  Retrieve the form field values from the specified PDF object, specified as XML.
      //  A node whose tag name is a known PDF field name gives a value ; any other node is searched recursively.
      private function __get_pdfform_data ( &$fields, $xml )
      {
          $tag_name = $xml -> getName ( ) ;

          if ( isset ( $this -> PdfDefinitions [ $tag_name ] ) )
          {
              $fields [ $tag_name ] = ( string ) $xml ;
              return ;
          }

          foreach ( $xml -> children ( ) as $child )
              $this -> __get_pdfform_data ( $fields, $child ) ;
      }


      // __paranoid_checks -
      //  Checks for several kinds of inconsistencies in the supplied XML template.
      private function __paranoid_checks ( )
      {
          // Check that field names, PDF field names and constant names are unique
          $names = array ( ) ;
          $pdf_names = array ( ) ;
          $constant_names = array ( ) ;

          foreach ( $this -> FieldDefinitions as $def )
          {
              if ( ! isset ( $this -> PdfDefinitions [ $def -> PdfName ] ) )
                  error ( new PdfToTextFormException ( "Field \"{$def -> PdfName}\" is not defined in the PDF file." ) ) ;

              if ( isset ( $names [ $def -> Name ] ) )
                  error ( new PdfToTextFormException ( "Field \"{$def -> Name}\" is defined more than once." ) ) ;

              $names [ $def -> Name ] = true ;

              if ( isset ( $pdf_names [ $def -> PdfName ] ) )
                  error ( new PdfToTextFormException ( "PDF Field \"{$def -> PdfName}\" is referenced more than once." ) ) ;

              $pdf_names [ $def -> PdfName ] = true ;

              // The same constant may be declared by several fields, provided that its value does not change
              foreach ( $def -> Constants as $constant )
              {
                  $constant_name = $constant [ 'name' ] ;

                  if ( isset ( $constant_names [ $constant_name ] ) && $constant_names [ $constant_name ] != $constant [ 'value' ] )
                      error ( new PdfToTextFormException ( "Constant \"$constant_name\" is defined more than once with different values." ) ) ;

                  $constant_names [ $constant_name ] = $constant [ 'value' ] ;
              }
          }

          // Check that group names are unique and that the fields they are referencing exist
          $group_names = array ( ) ;

          foreach ( $this -> Groups as $group )
          {
              $group_name = $group [ 'name' ] ;

              if ( isset ( $group_names [ $group_name ] ) )
                  error ( new PdfToTextFormException ( "Group \"$group_name\" is defined more than once." ) ) ;

              // Remember this group : without this, the duplicate check above could never trigger
              $group_names [ $group_name ] = true ;

              if ( isset ( $names [ $group_name ] ) )
                  error ( new PdfToTextFormException ( "Group \"$group_name\" has the same name as an existing field." ) ) ;

              foreach ( $group [ 'fields' ] as $field_name )
              {
                  if ( ! isset ( $names [ $field_name ] ) )
                      error ( new PdfToTextFormException ( "Field \"$field_name\" of group \"$group_name\" does not exist." ) ) ;
              }
          }
      }
  }
  10845.  
  10846.  
  10847. /*==============================================================================================================
  10848.  
  10849. class PdfToTextFormFieldDefinition -
  10850. Contains an XML template form field definition.
  10851.  
  10852. ==============================================================================================================*/
  class PdfToTextFormFieldDefinition // extends Object
  {
      // Supported field types
      const TYPE_STRING = 1 ; // String
      const TYPE_CHOICE = 2 ; // Choice (must have <case> or <default> subtags)

      // Official name (as it will appear in the class based on the XML template)
      public $Name = false ;
      // Field name, as specified in the input PDF file
      public $PdfName = false ;
      // Field type
      public $Type = self::TYPE_STRING ;
      // Available constant values for this field when the "type" attribute has the value "choice".
      // Each entry is an array having the 'name', 'value' and 'default' keys.
      public $Constants = array ( ) ;


      /*--------------------------------------------------------------------------------------------------------------

          Constructor -
              Builds the field definition object.

          PARAMETERS
              $field_node (SimpleXMLElement) -
                  The <field> node of the XML template.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $field_node )
      {
          // Loop through attributes ; attributes other than the ones listed below are ignored
          foreach ( $field_node -> attributes ( ) as $attribute_name => $attribute_value )
          {
              switch ( strtolower ( $attribute_name ) )
              {
                  // "name" attribute :
                  //  Specifies the field name as it will appear in the output class. Must be a valid PHP name.
                  case 'name' :
                      $this -> Name = PdfToTextFormDefinition::ValidatePhpName ( ( string ) $attribute_value ) ;
                      break ;

                  // "form-field" attribute :
                  //  Corresponding field name in the input PDF form.
                  case 'form-field' :
                      $this -> PdfName = ( string ) $attribute_value ;
                      break ;

                  // "type" attribute :
                  //  Field type. Can be either :
                  //  - "string" :
                  //      The field value can be any type of string.
                  //  - "choice" :
                  //      The field value has one of the values defined by the <case> or <default> subtags.
                  case 'type' :
                      switch ( strtolower ( ( string ) $attribute_value ) )
                      {
                          case 'string' :
                              $this -> Type = self::TYPE_STRING ;
                              break ;

                          case 'choice' :
                              $this -> Type = self::TYPE_CHOICE ;
                              break ;

                          default :
                              error ( new PdfToTextFormException ( "Invalid value \"$attribute_value\" for the \"$attribute_name\" attribute of the <field> tag." ) ) ;
                      }

                      break ;
              }
          }

          // The "name" and "form-field" attributes are mandatory
          if ( ! $this -> Name )
              error ( new PdfToTextFormException ( "The \"name\" attribute is mandatory for the <field> tag." ) ) ;

          if ( ! $this -> PdfName )
              error ( new PdfToTextFormException ( "The \"form-field\" attribute is mandatory for the <field> tag." ) ) ;

          // Only "type=choice" entries have <case> or <default> subtags to look for
          if ( $this -> Type !== self::TYPE_CHOICE )
              return ;

          foreach ( $field_node -> children ( ) as $child )
          {
              $tag_name = $child -> getName ( ) ;
              $lcname = strtolower ( $tag_name ) ;

              // Check for unrecognized tags
              if ( $lcname !== 'case' && $lcname !== 'default' )
              {
                  error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <field> definition." ) ) ;
                  continue ;
              }

              // <default> gives the value to be used when no PDF field value matches the defined constants ;
              // <case> maps a value to a constant name that will be defined in the generated class.
              $is_default = ( $lcname === 'default' ) ;

              // "false" means "attribute not specified" ; this is what makes the checks below possible
              $constant_value = false ;
              $constant_name = false ;

              // Retrieve attributes
              foreach ( $child -> attributes ( ) as $attribute_name => $attribute_value )
              {
                  switch ( strtolower ( $attribute_name ) )
                  {
                      // "value" attribute :
                      //  PDF form field value.
                      case 'value' :
                          $constant_value = ( string ) $attribute_value ;
                          break ;

                      // "constant" attribute :
                      //  Associated constant.
                      case 'constant' :
                          $constant_name = PdfToTextFormDefinition::ValidatePhpName ( ( string ) $attribute_value ) ;
                          break ;

                      // Bail out if any unrecognized attribute has been specified
                      default :
                          error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <$tag_name> tag." ) ) ;
                  }
              }

              // Each <case> entry must have a "value" attribute ; it is optional for <default>, where a
              // missing value is stored as an empty string
              if ( $constant_value === false )
              {
                  if ( ! $is_default )
                      error ( new PdfToTextFormException ( "Missing constant value in <case> tag." ) ) ;

                  $constant_value = "" ;
              }

              // Both <case> and <default> entries must have a "constant" attribute
              if ( $constant_name === false )
                  error ( new PdfToTextFormException ( "Attribute \"constant\" is required for <$tag_name> tag." ) ) ;

              // Add this to the list of existing constants
              $this -> Constants [] = array
              (
                  'name' => $constant_name,
                  'value' => $constant_value,
                  'default' => $is_default
              ) ;
          }
      }
  }
  10993.  
  10994.  
  10995. /*==============================================================================================================
  10996.  
  10997. class PdfToTextFormData -
  10998. Base class for all Pdf form templates data.
  10999.  
  11000. ==============================================================================================================*/
  class PdfToTextFormData // extends Object
  {
      // Doc comments provide information about form data fields (mainly to handle grouped field values)
      // The $__Properties array gives information about the form data fields themselves ; each entry is
      // indexed by property name and has the 'name', 'fields' and 'separator' keys ('fields' is false
      // for a property that is not a group).
      private $__Properties = array ( ) ;


      /*--------------------------------------------------------------------------------------------------------------

          Constructor -
              Retrieve information about the derived class properties, which are specified by the derived class
              generated on the fly.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( )
      {
          // Get class properties
          $reflection = new ReflectionClass ( $this ) ;
          $properties = $reflection -> getProperties ( ) ;

          // Loop through class properties
          foreach ( $properties as $property )
          {
              $propname = $property -> getName ( ) ;
              $doc_comment = $property -> getDocComment ( ) ;

              // A doc comment may indicate either :
              // - A form data field (@formdata)
              // - A grouped field ; in this case, we will have the following tags :
              //   . @formdata
              //   . @group(field_list) : list of fields grouped for this property
              //   . @separator(string) : a separator used when catenating grouped fields
              // Properties without a doc comment, or without the @formdata tag, do not belong to the form.
              if ( ! $doc_comment || strpos ( $doc_comment, '@formdata' ) === false )
                  continue ;

              $fields = false ;
              $separator = false ;

              // @group(fields) pattern
              if ( preg_match ( '/group \s* \( \s* (?P<fields> [^)]+) \)/imsx', $doc_comment, $match ) )
                  $fields = array_map ( 'trim', explode ( ',', $match [ 'fields' ] ) ) ;

              // @separator(string) pattern.
              // The repetition must be located INSIDE the named group : when the group itself is repeated,
              // only its last iteration (ie, the last character of the separator) is captured, which broke
              // every separator longer than one character. Blanks following the opening parenthesis are
              // part of the separator. A backslash escapes the next character, so that "\)" does not end
              // the separator ; stripslashes() then removes the escapes.
              if ( preg_match ( '/separator \s* \( (?P<separator> (?: \\\\ . | [^)\\\\] )* ) \) /imsx', $doc_comment, $match ) )
                  $separator = stripslashes ( $match [ 'separator' ] ) ;

              // Property belongs to the form - add it to the list of available properties
              $this -> __Properties [ $propname ] = array
              (
                  'name' => $propname,
                  'fields' => $fields,
                  'separator' => $separator
              ) ;
          }
      }


      /*--------------------------------------------------------------------------------------------------------------

          __get -
              Returns the underlying property value for this PDF data field.
              A warning is issued when the property is not a form data field.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __get ( $member )
      {
          if ( ! isset ( $this -> __Properties [ $member ] ) )
              warning ( new PdfToTextFormException ( "Undefined property \"$member\"." ) ) ;

          return ( $this -> $member ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          __set -
              Sets the underlying property value for this PDF data field.
              When the property is a compound one, sets individual members as well.
              When the property is a member of a compound one, the compound property is rebuilt.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __set ( $member, $value )
      {
          // Property does not exist : let PHP act as the default way
          if ( ! isset ( $this -> __Properties [ $member ] ) )
          {
              $this -> $member = $value ;
              return ;
          }

          // Property exists : some special processing will be needed
          $prop_entry = $this -> __Properties [ $member ] ;

          // Non-compound property
          if ( ! $prop_entry [ 'fields' ] )
          {
              $this -> $member = $value ;

              // However, we have to check that this property belongs to a compound property and change
              // the compound property value accordingly
              foreach ( $this -> __Properties as $name => $property )
              {
                  if ( ! $property [ 'fields' ] || ! in_array ( $member, $property [ 'fields' ], true ) )
                      continue ;

                  $values = array ( ) ;

                  // A dedicated loop variable is used here, so that the $value parameter is left untouched
                  foreach ( $property [ 'fields' ] as $field_name )
                      $values [] = $this -> $field_name ;

                  // Change compound property value accordingly, using the specified separator
                  $this -> $name = implode ( $property [ 'separator' ], $values ) ;
              }

              return ;
          }

          // Compound property : we will have to explode it in separate parts, using the compound property separator,
          // then set individual property values
          $values = explode ( $prop_entry [ 'separator' ], $value ) ;
          $value_count = count ( $values ) ;
          $field_count = count ( $prop_entry [ 'fields' ] ) ;

          if ( $value_count < $field_count )
              error ( new PdfToTextFormException ( "Not enough value parts specified for the \"$member\" property ($value)." ) ) ;
          else if ( $value_count > $field_count )
              error ( new PdfToTextFormException ( "Too much value parts specified for the \"$member\" property ($value)." ) ) ;

          $this -> $member = $value ;

          for ( $i = 0 ; $i < $value_count ; $i ++ )
          {
              $sub_member = $prop_entry [ 'fields' ] [$i] ;
              $this -> $sub_member = $values [$i] ;
          }
      }
  }
  11152.  
  11153.  
  11154. /**************************************************************************************************************
  11155. **************************************************************************************************************
  11156. **************************************************************************************************************
  11157. ****** ******
  11158. ****** ******
  11159. ****** CAPTURE DEFINITION MANAGEMENT ******
  11160. ****** (none of the classes listed here are meant to be instantiated outside this file) ******
  11161. ****** ******
  11162. ****** ******
  11163. **************************************************************************************************************
  11164. **************************************************************************************************************
  11165. **************************************************************************************************************/
  11166.  
  11167. /*==============================================================================================================
  11168.  
  11169. class PdfToTextCaptureDefinitions -
  11170. Holds text capture definitions, whose XML data has been supplied to the PdfToText::SetCapture() method.
  11171.  
  11172. ==============================================================================================================*/
  class PdfToTextCaptureDefinitions // extends Object
      implements ArrayAccess, Countable, Iterator
  {
      // Shape definitions, indexed by shape name - The actual objects populating this array depend on the
      // definitions supplied (<rectangle>, <lines>)
      protected $ShapeDefinitions = array ( ) ;

      // Shape names - used for iteration
      private $ShapeNames = array ( ) ;

      // Page count - remains false until SetPageCount() is called
      private $PageCount = false ;


      /*--------------------------------------------------------------------------------------------------------------

          CONSTRUCTOR -
              Analyzes the XML data defining the areas to be captured.

          PARAMETERS
              $xml_data (string) -
                  XML capture definitions. The root tag must be <captures> ; its children can be
                  <rectangle> or <lines> tags.

          NOTES
              Errors are reported by handing a PdfToTextCaptureException to error(), which is defined elsewhere
              in this file and is presumably expected to throw it.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $xml_data )
      {
          $xml = simplexml_load_string ( $xml_data ) ;

          // simplexml_load_string() returns false when the supplied data cannot be parsed
          if ( $xml === false )
          {
              error ( new PdfToTextCaptureException ( "The supplied capture definitions are not valid XML data." ) ) ;
              return ;
          }

          $root_entry = $xml -> getName ( ) ;

          // Root tag must be <captures>
          if ( strcasecmp ( $root_entry, "captures" ) )
              error ( new PdfToTextCaptureException ( "Root entry must be <captures>, <$root_entry> was found." ) ) ;

          // Process the child nodes
          foreach ( $xml -> children ( ) as $child )
          {
              $tag_name = $child -> getName ( ) ;

              switch ( strtolower ( $tag_name ) )
              {
                  // <rectangle> :
                  //     A rectangle whose dimensions are given in the <page> subtags.
                  case 'rectangle' :
                      $shape_object = new PdfToTextCaptureRectangleDefinition ( $child ) ;
                      break ;

                  // <lines> :
                  //     A definition of columns and their applicable pages.
                  case 'lines' :
                      $shape_object = new PdfToTextCaptureLinesDefinition ( $child ) ;
                      break ;

                  // Complain if an unknown tag is found
                  default :
                      error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <captures>." ) ) ;
                      // Only reached if error() returns : there is no shape object to register for this tag
                      continue 2 ;
              }

              // Shape names must be unique within the definitions
              if ( isset ( $this -> ShapeDefinitions [ $shape_object -> Name ] ) )
                  error ( new PdfToTextCaptureException ( "The shape named \"{$shape_object -> Name}\" has been defined more than once." ) ) ;
              else
                  $this -> ShapeDefinitions [ $shape_object -> Name ] = $shape_object ;
          }

          // Build an array of shape names for the iterator interface
          $this -> ShapeNames = array_keys ( $this -> ShapeDefinitions ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              GetCapturedObject - Creates an object reflecting the captured data.

          PROTOTYPE
              $captures = $capture_definitions -> GetCapturedObject ( $document_fragments ) ;

          DESCRIPTION
              Asks each shape definition to extract its areas from the document fragments, groups the results
              per page, then wraps them into a PdfToTextCaptures object.

          PARAMETERS
              $document_fragments (array) -
                  Document text fragments collected during the text layout rendering process.

          RETURN VALUE
              An object of type PdfToTextCaptures, containing the captured data.

       *-------------------------------------------------------------------------------------------------------------*/
      public function GetCapturedObject ( $document_fragments )
      {
          $captures = array ( ) ;

          foreach ( $this -> ShapeDefinitions as $shape )
          {
              $capture = $shape -> ExtractAreas ( $document_fragments ) ;

              foreach ( $capture as $page => $items )
              {
                  $captures [ $page ] [] = $items ;
              }
          }

          $captured_object = new PdfToTextCaptures ( $captures ) ;

          return ( $captured_object ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              SetPageCount - Defines the total number of pages in the document.

          PROTOTYPE
              $capture_definitions -> SetPageCount ( $count ) ;

          DESCRIPTION
              At the time when XML definitions are processed, the total number of pages in the document is not yet
              known. Moreover, page ranges or page numbers can be expressed relative to the last page of the
              document (for example : 1..$-1, which means "from the first page to the last page - 1").
              Setting the page count once it is known allows to process the expressions specified in the "number"
              attribute of the <page> tag so that the expressions are transformed into actual page numbers.
              The count is stored, then forwarded to every shape definition.

          PARAMETERS
              $count (integer) -
                  Number of pages in the document.

       *-------------------------------------------------------------------------------------------------------------*/
      public function SetPageCount ( $count )
      {
          $this -> PageCount = $count ;

          foreach ( $this -> ShapeDefinitions as $def )
          {
              $def -> SetPageCount ( $count ) ;
          }
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              GetNodeAttributes - Retrieves an XML node's attributes.

          PROTOTYPE
              $result = PdfToTextCaptureDefinitions::GetNodeAttributes ( $node, $attributes ) ;

          DESCRIPTION
              Retrieves the attributes defined for the specified XML node.

          PARAMETERS
              $node (SimpleXMLElement) -
                  Node whose attributes are to be extracted.

              $attributes (associative array) -
                  Associative array whose keys are the (lowercase) attribute names and whose values define a
                  boolean indicating whether the attribute is mandatory or not.

          RETURN VALUE
              Returns an associative array whose keys are the attribute names and whose values are the attribute
              values, specified as a string.
              For optional unspecified attributes, the value will be boolean false.

          NOTES
              A PdfToTextCaptureException is handed to error() if the node contains an unknown attribute, or if
              a mandatory attribute is missing.

       *-------------------------------------------------------------------------------------------------------------*/
      public static function GetNodeAttributes ( $node, $attributes )
      {
          $tag_name = $node -> getName ( ) ;

          // Build the initial value for the resulting array
          $result = array ( ) ;

          foreach ( array_keys ( $attributes ) as $name )
              $result [ $name ] = false ;

          // Loop through node attributes
          foreach ( $node -> attributes ( ) as $attribute_name => $attribute_value )
          {
              // Attribute names are matched case-insensitively
              $attribute_name = strtolower ( $attribute_name ) ;

              // Check that the attribute exists ; if yes, add it to the resulting array
              if ( isset ( $attributes [ $attribute_name ] ) )
                  $result [ $attribute_name ] = ( string ) $attribute_value ;
              // Otherwise, report the error
              else
                  error ( new PdfToTextCaptureException ( "Undefined attribute \"$attribute_name\" for node <$tag_name>." ) ) ;
          }

          // Check that all mandatory attributes have been specified
          foreach ( $attributes as $attribute_name => $mandatory )
          {
              if ( $mandatory && $result [ $attribute_name ] === false )
                  error ( new PdfToTextCaptureException ( "Mandatory attribute \"$attribute_name\" is missing for node <$tag_name>." ) ) ;
          }

          // All done, return
          return ( $result ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              GetBooleanAttribute - Returns a boolean value associated to a string.

          PROTOTYPE
              $bool = PdfToTextCaptureDefinitions::GetBooleanAttribute ( $value ) ;

          DESCRIPTION
              Returns a boolean value corresponding to a boolean specified as a string.

          PARAMETERS
              $value (string or boolean) -
                  A boolean value represented as a string (the comparison is not case-sensitive).
                  The strings 'true', 'yes', 'on' and '1' will be interpreted as boolean true.
                  The strings 'false', 'no', 'off' and '0' will be interpreted as boolean false.
                  Actual boolean values are returned as is.

          RETURN VALUE
              The boolean value corresponding to the specified string.

          NOTES
              A PdfToTextCaptureException is handed to error() if the supplied string is incorrect.

       *-------------------------------------------------------------------------------------------------------------*/
      public static function GetBooleanAttribute ( $value )
      {
          $lcvalue = strtolower ( $value ) ;

          if ( $lcvalue === 'true' || $lcvalue === 'on' || $lcvalue === 'yes' || $lcvalue === '1' || $value === true )
              return ( true ) ;
          else if ( $lcvalue === 'false' || $lcvalue === 'off' || $lcvalue === 'no' || $lcvalue === '0' || $value === false )
              return ( false ) ;
          else
              error ( new PdfToTextCaptureException ( "Invalid boolean value \"$value\"." ) ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          Interfaces implementations.
          The #[\ReturnTypeWillChange] lines are plain comments before PHP 8 ; from PHP 8.1 onwards, they silence
          the deprecation notices issued for interface methods declared without a return type.

       *-------------------------------------------------------------------------------------------------------------*/

      // Countable interface
      #[\ReturnTypeWillChange]
      public function count ( )
      { return ( count ( $this -> ShapeDefinitions ) ) ; }


      // ArrayAccess interface - read-only access to shape definitions, by shape name
      #[\ReturnTypeWillChange]
      public function offsetExists ( $offset )
      { return ( isset ( $this -> ShapeDefinitions [ $offset ] ) ) ; }


      #[\ReturnTypeWillChange]
      public function offsetGet ( $offset )
      { return ( $this -> ShapeDefinitions [ $offset ] ) ; }


      #[\ReturnTypeWillChange]
      public function offsetSet ( $offset, $value )
      { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }


      #[\ReturnTypeWillChange]
      public function offsetUnset ( $offset )
      { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }


      // Iterator interface -
      //     Iteration is made through shape names, which are supplied by the $ShapeNames property
      private $__iterator_index = 0 ;

      #[\ReturnTypeWillChange]
      public function rewind ( )
      { $this -> __iterator_index = 0 ; }

      #[\ReturnTypeWillChange]
      public function valid ( )
      { return ( $this -> __iterator_index >= 0 && $this -> __iterator_index < count ( $this -> ShapeNames ) ) ; }

      #[\ReturnTypeWillChange]
      public function key ( )
      { return ( $this -> ShapeNames [ $this -> __iterator_index ] ) ; }

      #[\ReturnTypeWillChange]
      public function next ( )
      { $this -> __iterator_index ++ ; }

      #[\ReturnTypeWillChange]
      public function current ( )
      { return ( $this -> ShapeDefinitions [ $this -> ShapeNames [ $this -> __iterator_index ] ] ) ; }
  }
  11457.  
  11458.  
  11459. /*==============================================================================================================
  11460.  
  11461. class PdfToTextCaptureShapeDefinition -
  11462. Base class for capturing shapes.
  11463.  
  11464. ==============================================================================================================*/
  abstract class PdfToTextCaptureShapeDefinition //extends Object
  {
      // Shape type identifiers, stored in the $Type property
      const SHAPE_RECTANGLE = 1 ;
      const SHAPE_COLUMN = 2 ;
      const SHAPE_LINE = 3 ;

      // Name of the capture ("name" attribute of the shape tag)
      public $Name ;
      // Type of the capture - one of the SHAPE_* constants, supplied by derived classes
      public $Type ;
      // Pages this capture applies to
      public $ApplicablePages ;
      // Areas of this shape, per page
      public $Areas = array ( ) ;
      // String inserted between elements when several of them are covered by the same shape
      public $Separator = " " ;


      /*--------------------------------------------------------------------------------------------------------------

          Constructor -
              Records the shape type and creates an empty list of applicable pages.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $type )
      {
          $this -> Type = $type ;
          $this -> ApplicablePages = new PdfToTextCaptureApplicablePages ( ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          SetPageCount -
              Forwards the document page count to the applicable pages object.
              Derived classes can override this method if some additional work is needed.

       *-------------------------------------------------------------------------------------------------------------*/
      public function SetPageCount ( $count )
      {
          $this -> ApplicablePages -> SetPageCount ( $count ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          GetFragmentData -
              Extracts the text and bounding coordinates of a text fragment into the supplied reference
              parameters. The fragment is an associative array holding the 'x', 'y', 'width', 'font-height'
              and 'text' entries.

       *-------------------------------------------------------------------------------------------------------------*/
      protected function GetFragmentData ( $fragment, &$text, &$left, &$top, &$right, &$bottom )
      {
          $x = ( float ) $fragment [ 'x' ] ;
          $y = ( float ) $fragment [ 'y' ] ;
          $width = ( float ) $fragment [ 'width' ] ;
          $font_height = ( float ) $fragment [ 'font-height' ] ;
          $contents = $fragment [ 'text' ] ;

          $left = $x ;
          $top = $y ;
          // The right edge is the last unit covered by the fragment, hence the "- 1"
          $right = $x + $width - 1 ;
          // The bottom edge is obtained by subtracting the font height from the top coordinate
          $bottom = $y - $font_height ;
          $text = $contents ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          GetAttributes -
              Retrieves the attributes of the given XML node, after adding the attributes common to all shapes
              to the list of accepted ones :
              - name (mandatory), stored into the Name property
              - separator (optional), unescaped then stored into the Separator property
              Returns the attribute values, as supplied by PdfToTextCaptureDefinitions::GetNodeAttributes().

       *-------------------------------------------------------------------------------------------------------------*/
      protected function GetAttributes ( $node, $attributes = array ( ) )
      {
          $common_attributes = array ( 'name' => true, 'separator' => false ) ;
          $accepted = array_merge ( $attributes, $common_attributes ) ;
          $values = PdfToTextCaptureDefinitions::GetNodeAttributes ( $node, $accepted ) ;

          $this -> Name = $values [ 'name' ] ;

          // An unspecified separator is reported as boolean false : keep the current separator in that case
          $separator = $values [ 'separator' ] ;

          if ( $separator !== false )
              $this -> Separator = PdfToText::Unescape ( $separator ) ;

          return ( $values ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          ExtractAreas -
              Extracts text contents from the document fragments. Must be implemented by derived classes.

       *-------------------------------------------------------------------------------------------------------------*/
      abstract public function ExtractAreas ( $document_fragments ) ;
  }
  11555.  
  11556.  
  11557. /*==============================================================================================================
  11558.  
  11559. class PdfToTextCaptureRectangleDefinition -
  11560. A shape for capturing text in rectangle areas.
  11561.  
  11562. ==============================================================================================================*/
  class PdfToTextCaptureRectangleDefinition extends PdfToTextCaptureShapeDefinition
  {
      /*--------------------------------------------------------------------------------------------------------------

          CONSTRUCTOR -
              Processes a <rectangle> XML node. Its attributes are handled by GetAttributes() ; each <page>
              child node supplies a page number expression together with the rectangle coordinates, and is
              registered in the list of applicable pages. Any other child tag is reported as an error.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $node )
      {
          parent::__construct ( self::SHAPE_RECTANGLE ) ;

          $this -> GetAttributes ( $node ) ;

          // Attributes accepted by the <page> tag ; the boolean tells whether the attribute is mandatory
          $accepted_page_attributes = array
          (
              'number' => true,
              'left' => true,
              'right' => false,
              'top' => true,
              'bottom' => false,
              'width' => false,
              'height' => false
          ) ;

          foreach ( $node -> children ( ) as $child )
          {
              $tag_name = $child -> getName ( ) ;

              // Only <page> tags are allowed here
              if ( strtolower ( $tag_name ) !== 'page' )
              {
                  error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <rectangle>." ) ) ;
                  continue ;
              }

              $page_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes ( $child, $accepted_page_attributes ) ;

              // Register the page(s), keeping the coordinates as extra data
              $this -> ApplicablePages -> Add ( $page_attributes [ 'number' ], $page_attributes ) ;
          }
      }


      /*--------------------------------------------------------------------------------------------------------------

          ExtractAreas -
              Extracts text contents from the document fragments.
              Returns an array of PdfToTextCapturedRectangle objects, indexed by page number ; applicable pages
              where nothing was captured get an empty capture.

       *-------------------------------------------------------------------------------------------------------------*/
      public function ExtractAreas ( $document_fragments )
      {
          $captured = array ( ) ;

          foreach ( $document_fragments as $page => $page_contents )
          {
              $page_fragments = $page_contents [ 'fragments' ] ;

              // Only pages listed in the page map are concerned by this shape
              if ( isset ( $this -> ApplicablePages -> PageMap [ $page ] ) )
              {
                  foreach ( $page_fragments as $fragment )
                  {
                      $this -> GetFragmentData ( $fragment, $text, $left, $top, $right, $bottom ) ;

                      // Skip fragments that the area of this page does not report as contained
                      if ( ! $this -> Areas [ $page ] -> Contains ( $left, $top, $right, $bottom ) )
                          continue ;

                      if ( isset ( $captured [ $page ] ) )
                      {
                          // Several fragments fall within the rectangle : enlarge the captured box and
                          // append the text, using the "separator" attribute of the <rectangle> tag
                          $merged = $captured [ $page ] ;

                          $merged -> Top = max ( $merged -> Top, $top ) ;
                          $merged -> Bottom = min ( $merged -> Bottom, $bottom ) ;
                          $merged -> Left = min ( $merged -> Left, $left ) ;
                          $merged -> Right = max ( $merged -> Right, $right ) ;
                          $merged -> Text .= $this -> Separator . $text ;
                      }
                      else
                      {
                          // First fragment found for this page - the usual case of a single-line capture
                          $captured [ $page ] = new PdfToTextCapturedRectangle ( $page, $this -> Name, $text, $left, $top, $right, $bottom, $this ) ;
                      }
                  }
              }
          }

          // Provide empty values for applicable pages which did not capture anything
          $captured_count = count ( $captured ) ;

          foreach ( $this -> ApplicablePages as $page => $applicable )
          {
              if ( isset ( $captured [ $page ] ) )
                  continue ;

              $captured [ $page ] = new PdfToTextCapturedRectangle ( $page, $this -> Name, '', 0, 0, 0, 0, $this ) ;
          }

          // Empty values were appended at the end : restore the page number order
          if ( count ( $captured ) !== $captured_count )
              ksort ( $captured ) ;

          return ( $captured ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          SetPageCount -
              Forwards the page count to the parent class, then creates a capture area for each page found in
              the extra page map data of the applicable pages object. This can only be done once the number of
              pages is known.

       *-------------------------------------------------------------------------------------------------------------*/
      public function SetPageCount ( $count )
      {
          parent::SetPageCount ( $count ) ;

          foreach ( $this -> ApplicablePages -> ExtraPageMapData as $page_number => $page_data )
          {
              $this -> Areas [ $page_number ] = new PdfToTextCaptureArea ( $page_data ) ;
          }
      }
  }
  11701.  
  11702.  
  11703. /*==============================================================================================================
  11704.  
  11705. class PdfToTextCaptureLinesDefinition -
  11706. A shape for capturing text laid out in columns that repeat over successive lines.
  11707.  
  11708. ==============================================================================================================*/
  11709. class PdfToTextCaptureLinesDefinition extends PdfToTextCaptureShapeDefinition
  11710. {
  11711. // Column areas
  11712. public $Columns = array ( ) ;
  11713. // Top and bottom lines
  11714. public $Tops = array ( ) ;
  11715. public $Bottoms = array ( ) ;
  11716. // Column names
  11717. private $ColumnNames = array ( ) ;
  11718.  
  11719.  
  11720. /*--------------------------------------------------------------------------------------------------------------
  11721.  
  11722. CONSTRUCTOR -
  11723. Analyzes the contents of a <lines> XML node, which contains <page> nodes giving a part of the column
  11724. dimensions, and <column> nodes which specify the name of the column and the remaining coordinates,
  11725. such as "left" or "width"
  11726.  
  11727. *-------------------------------------------------------------------------------------------------------------*/
  public function __construct ( $node )
  {
      // Analyzes a <lines> XML node :
      // - The attributes of the node itself are handled by GetAttributes() ; the optional "default" attribute
      //   gives the default value of columns that do not specify their own.
      // - <page> child nodes give the applicable pages, and the vertical part of the column dimensions.
      // - <column> child nodes give the column names and the horizontal part of the column dimensions.
      // Errors are reported by handing a PdfToTextCaptureException to error(), defined elsewhere in this file.
      parent::__construct ( self::SHAPE_COLUMN ) ;

      $shape_attributes = $this -> GetAttributes ( $node, array ( 'default' => false ) ) ;

      // An unspecified attribute is reported as boolean false ; comparing strictly preserves a default
      // value such as "0", which a simple truthiness test would discard
      $column_default = ( $shape_attributes [ 'default' ] !== false ) ? $shape_attributes [ 'default' ] : '' ;

      // Loop through node's children
      foreach ( $node -> children ( ) as $child )
      {
          $tag_name = $child -> getName ( ) ;

          switch ( strtolower ( $tag_name ) )
          {
              // <page> tag
              case 'page' :
                  // Retrieve the specified attributes
                  $page_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes
                  (
                      $child,
                      array
                      (
                          'number' => true,
                          'top' => true,
                          'height' => true,
                          'bottom' => false
                      )
                  ) ;

                  // We have to store the y-coordinate of the first and last lines, to determine until which
                  // position we have to check for column contents.
                  // The "top" and "bottom" attributes of the <page> tag actually determine the top and bottom
                  // y-coordinates where to search for columns. However, we will have to rename the "bottom"
                  // attribute to "column-bottom", in order for it not to be mistaken with actual column rectangle
                  // (only the "height" attribute of the <page> tag gives the height of a line).
                  // An unspecified "bottom" attribute (boolean false) yields a column bottom of 0.
                  $page_attributes [ 'column-top' ] = $page_attributes [ 'top' ] ;
                  $page_attributes [ 'column-bottom' ] = ( double ) $page_attributes [ 'bottom' ] ;
                  unset ( $page_attributes [ 'bottom' ] ) ;

                  // Add this page to the list of applicable pages for this shape
                  $this -> ApplicablePages -> Add ( $page_attributes [ 'number' ], $page_attributes ) ;

                  break ;

              // <column> tag :
              case 'column' :
                  $column_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes
                  (
                      $child,
                      array
                      (
                          'name' => true,
                          'left' => false,
                          'right' => false,
                          'width' => false,
                          'default' => false
                      )
                  ) ;

                  $column_name = $column_attributes [ 'name' ] ;

                  // Build the final default value, if any one is specified ; the following special constructs are processed :
                  // - "%c" :
                  //     Replaced by the column name.
                  // - "%n" :
                  //     Replaced by the column index (starting from zero).
                  // A missing or empty "default" attribute falls back to the default value of the <lines> tag ;
                  // the explicit comparisons keep a column default of "0" intact.
                  if ( $column_attributes [ 'default' ] === false || $column_attributes [ 'default' ] === '' )
                      $column_attributes [ 'default' ] = $column_default ;

                  $substitutes = array
                  (
                      '%c' => $column_name,
                      '%n' => count ( $this -> Columns )
                  ) ;

                  $column_attributes [ 'default' ] = str_replace
                  (
                      array_keys ( $substitutes ),
                      array_values ( $substitutes ),
                      $column_attributes [ 'default' ]
                  ) ;

                  // Add the column definition to this object ; column names must be unique
                  if ( ! isset ( $this -> Columns [ $column_name ] ) )
                  {
                      $this -> Columns [ $column_name ] = $column_attributes ;
                      $this -> ColumnNames [] = $column_name ;
                  }
                  else
                      error ( new PdfToTextCaptureException ( "Column \"$column_name\" is defined more than once." ) ) ;

                  break ;

              // Other tag : report the error, naming the tag that is actually being processed
              default :
                  error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <lines>." ) ) ;
          }
      }
  }
  11827.  
  11828.  
  11829. /*--------------------------------------------------------------------------------------------------------------
  11830.  
  11831. ExtractAreas -
  11832. Extracts text contents from the document fragments.
  11833.  
  11834. *-------------------------------------------------------------------------------------------------------------*/
  11835. public function ExtractAreas ( $document_fragments )
  11836. {
  11837. $result = array ( ) ;
  11838.  
  11839. // Loop through each page of document fragments
  11840. foreach ( $document_fragments as $page => $page_contents )
  11841. {
  11842. $fragments = $page_contents [ 'fragments' ] ;
  11843.  
  11844. // Ignore this page if not included in the <columns> definition
  11845. if ( ! isset ( $this -> ApplicablePages -> PageMap [ $page ] ) )
  11846. continue ;
  11847.  
  11848. // <columns> definition only gives the location of the first line of each column, together
  11849. // with its height.
  11850. // We will build as many new column areas as can fit on one page
  11851. $this_page_areas = $this -> Areas [ $page ] ;
  11852. $column_areas = array ( ) ;
  11853.  
  11854. for ( $i = 0, $count = count ( $this_page_areas ) ; $i < $count ; $i ++ )
  11855. {
  11856. // For now, duplicate the existing column areas - they will represent the 1st line of columns
  11857. $this_page_area = $this_page_areas [$i] ;
  11858. $new_area = clone ( $this_page_area ) ;
  11859. $column_areas [0] [] = $new_area ;
  11860. $line_height = $new_area -> Height ;
  11861. $current_top = $new_area -> Top - $line_height ;
  11862. $current_line = 0 ;
  11863.  
  11864. // Then build new column areas for each successive lines
  11865. while ( $current_top - $line_height >= 0 )
  11866. {
  11867. $current_line ++ ;
  11868. $new_area = clone ( $new_area ) ;
  11869. $new_area -> Top -= $line_height ;
  11870. $new_area -> Bottom -= $line_height ;
  11871.  
  11872. $column_areas [ $current_line ] [] = $new_area ;
  11873. $current_top -= $line_height ;
  11874. }
  11875. }
  11876.  
  11877. // Now extract the columns, line per line, from the current page's text fragments
  11878. $found_lines = array ( ) ;
  11879.  
  11880. foreach ( $fragments as $fragment )
  11881. {
  11882. $this -> GetFragmentData ( $fragment, $text, $left, $top, $right, $bottom ) ;
  11883.  
  11884. // Loop through each line of column areas, built from the above step
  11885. foreach ( $column_areas as $line => $column_areas_per_name )
  11886. {
  11887. $index = 0 ; // Column index
  11888.  
  11889. // Process each column area
  11890. foreach ( $column_areas_per_name as $column_area )
  11891. {
  11892. // ... but only do something if the current column area is contained in the current fragment
  11893. if ( $column_area -> Contains ( $left, $top, $right, $bottom ) )
  11894. {
  11895. // The normal usage will be to capture one-line columns...
  11896. if ( ! isset ( $found_lines [ $line ] [ $column_area -> Name ] ) )
  11897. {
  11898. $found_lines [ $line ] [ $column_area -> Name ] =
  11899. new PdfToTextCapturedColumn ( $page, $column_area -> Name, $text,
  11900. $left, $top, $right, $bottom, $this ) ;
  11901. }
  11902. // ... but you can also use them to capture multiple lines ; in this case, the "separator" attribute of the <lines> or
  11903. // <column> tag will be used to separate items
  11904. else
  11905. {
  11906. $existing_area = $found_lines [ $line ] [ $column_area -> Name ] ;
  11907.  
  11908. $existing_area -> Top = max ( $existing_area -> Top , $column_area -> Top ) ;
  11909. $existing_area -> Bottom = min ( $existing_area -> Bottom, $column_area -> Bottom ) ;
  11910. $existing_area -> Left = min ( $existing_area -> Left , $column_area -> Left ) ;
  11911. $existing_area -> Right = max ( $existing_area -> Right , $column_area -> Right ) ;
  11912. $existing_area -> Text .= $this -> Separator . $text ;
  11913. }
  11914. }
  11915.  
  11916. $index ++ ;
  11917. }
  11918. }
  11919. }
  11920.  
  11921. // A final pass to provide default values for empty columns (usually, column values that are not represented in the PDF file)
  11922. // Also get the surrounding box for the whole line
  11923. $final_lines = array ( ) ;
  11924.  
  11925. foreach ( $found_lines as $line => $columns_line )
  11926. {
  11927. foreach ( $this -> ColumnNames as $column_name )
  11928. {
  11929. if ( ! isset ( $columns_line [ $column_name ] ) )
  11930. {
  11931. $columns_line [ $column_name ] =
  11932. new PdfToTextCapturedColumn ( $page, $column_name, $this -> Columns [ $column_name ] [ 'default' ], 0, 0, 0, 0, $this ) ;
  11933. }
  11934. }
  11935.  
  11936. // Get the (left,top) coordinates of the line
  11937. $line_left = $found_lines [ $line ] [ $this -> ColumnNames [0] ] -> Left ;
  11938. $line_top = $found_lines [ $line ] [ $this -> ColumnNames [0] ] -> Top ;
  11939.  
  11940. // Get the (right,bottom) coordinates - we have to find the last column whose value is not a default value
  11941. // (and therefore, has a non-zero Right coordinate)
  11942. $last = count ( $this -> ColumnNames ) - 1 ;
  11943. $line_right = 0 ;
  11944. $line_bottom = 0 ;
  11945.  
  11946. while ( $last >= 0 && ! $columns_line [ $this -> ColumnNames [ $last ] ] -> Right )
  11947. $last -- ;
  11948.  
  11949. if ( $last > 0 )
  11950. {
  11951. $line_right = $columns_line [ $this -> ColumnNames [ $last ] ] -> Right ;
  11952. $line_bottom = $columns_line [ $this -> ColumnNames [ $last ] ] -> Bottom ;
  11953. }
  11954.  
  11955. // Create a CaptureLine entry
  11956. $final_lines [] = new PdfToTextCapturedLine ( $page, $this -> Name, $columns_line, $line_left, $line_top, $line_right, $line_bottom, $this ) ;
  11957. }
  11958.  
  11959. // The result for this page will be a CapturedLines object
  11960. $result [ $page ] = new PdfToTextCapturedLines ( $this -> Name, $page, $final_lines ) ;
  11961. }
  11962.  
  11963. // All done, return
  11964. return ( $result ) ;
  11965. }
  11966.  
  11967.  
  11968. /*--------------------------------------------------------------------------------------------------------------
  11969.  
  11970. SetPageCount -
  11971. Forwards the page count to the parent class, then builds the column capture areas of each applicable page.
  11972.  
  11973. *-------------------------------------------------------------------------------------------------------------*/
  /**
   * Forwards the page count to the parent class, then prepares the per-page capture data.
   *
   * For every applicable page and every column definition, a PdfToTextCaptureArea is
   * appended to $this -> Areas [ page ] ; the page entry of ExtraPageMapData is passed as
   * the source of default coordinates. The 'column-top' and 'column-bottom' entries of that
   * same page data are recorded once per page in $this -> Tops and $this -> Bottoms.
   *
   * @param  mixed  $count  Page count, passed unchanged to parent::SetPageCount().
   */
  public function SetPageCount ( $count )
  {
      parent::SetPageCount ( $count ) ;

      foreach ( $this -> ApplicablePages as $page_number => $is_applicable )
      {
          if ( $is_applicable )
          {
              foreach ( $this -> Columns as $column_definition )
              {
                  // The top/bottom limits are recorded only once for a given page
                  if ( ! isset ( $this -> Tops [ $page_number ] ) )
                  {
                      $this -> Tops [ $page_number ] = ( float ) $this -> ApplicablePages -> ExtraPageMapData [ $page_number ] [ 'column-top' ] ;
                      $this -> Bottoms [ $page_number ] = ( float ) $this -> ApplicablePages -> ExtraPageMapData [ $page_number ] [ 'column-bottom' ] ;
                  }

                  // One capture area per column ; the page data supplies the missing coordinates
                  $this -> Areas [ $page_number ] [] = new PdfToTextCaptureArea
                     (
                      $column_definition,
                      $this -> ApplicablePages -> ExtraPageMapData [ $page_number ],
                      $column_definition [ 'name' ]
                     ) ;
              }
          }
      }
  }
  11997.  
  11998.  
  11999. /*--------------------------------------------------------------------------------------------------------------
  12000.  
  12001. Support functions.
  12002.  
  12003. *-------------------------------------------------------------------------------------------------------------*/
  12004. }
  12005.  
  12006.  
  12007. /*==============================================================================================================
  12008.  
  12009. class PdfToTextCaptureApplicablePages -
  12010. Holds a list of applicable pages given by the "number" attribute of <page> tags.
  12011.  
  12012. ==============================================================================================================*/
  class PdfToTextCaptureApplicablePages //extends Object
          implements ArrayAccess, Countable, Iterator
  {
      // Ranges of pages, as given by the "number" attribute of the <page> tag. Each entry is an array
      // ( low, high, extra_data ). Since a page number expression can refer to the last page ("$"), and the
      // total number of pages is not yet known when Add() is called, expressions are stored as strings and
      // only resolved by SetPageCount().
      protected $PageRanges = array ( ) ;

      // Built by SetPageCount() : each key is an applicable page number, each value is true.
      // Pages that do not apply have no entry.
      public $PageMap = array ( ) ;

      // Extra data (second parameter of Add()) associated with each applicable page of PageMap
      public $ExtraPageMapData = array ( ) ;

      // Page count - set by the SetPageCount() method (false until then)
      public $PageCount = false ;


      /*--------------------------------------------------------------------------------------------------------------

          CONSTRUCTOR
              Initializes the object. Nothing to do : page ranges are supplied later through Add().

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( )
      {
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              Add - Add a page number(s) definition.

          PROTOTYPE
              $applicable_pages -> Add ( $page_number, $extra_data = false ) ;

          DESCRIPTION
              Adds the page number(s) specified by the "number" attribute of the <page> tag to the list of
              applicable pages.

          PARAMETERS
              $page_number (string) -
                  A string defining which pages are applicable. This can be a single page number ("1"), a
                  comma-separated list of pages ("1, 2, 10") or range(s) of pages ("1..10, 12..20").
                  The special "$" character means "last page" ; thus "1, $-9..$" means : "applicable pages
                  are 1, plus the last ten pages of the document".

              $extra_data (mixed) -
                  Data stored as is with every range parsed from $page_number ; SetPageCount() copies it
                  into ExtraPageMapData for each applicable page.

       *-------------------------------------------------------------------------------------------------------------*/
      public function Add ( $page_number, $extra_data = false )
      {
          $this -> __parse_page_numbers ( $page_number, $extra_data ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              SetPageCount - Sets the total number of pages in the document.

          PROTOTYPE
              $applicable_pages -> SetPageCount ( $count ) ;

          DESCRIPTION
              Sets the total number of pages in the document and rebuilds the map of applicable pages.

          PARAMETERS
              $count (integer) -
                  Total number of pages in the document.

          NOTES
              Problems are reported through the error() and warning() functions, which are defined elsewhere
              in this file (presumably error() throws the supplied exception - to be confirmed).

       *-------------------------------------------------------------------------------------------------------------*/
      public function SetPageCount ( $count )
      {
          $this -> PageCount = $count ;
          $this -> PageMap = array ( ) ;

          // Loop through the page ranges - plain numbers were converted to integers by Add() ; the other ones,
          // built as expressions (using "$" for example), are evaluated here to give the actual page number
          foreach ( $this -> PageRanges as $range )
          {
              $low = $range [0] ;
              $high = $range [1] ;

              // Translate expressions to actual values for the low and high parts of the range
              if ( ! is_integer ( $low ) )
                  $low = $this -> __check_expression ( $low, $count ) ;

              if ( ! is_integer ( $high ) )
                  $high = $this -> __check_expression ( $high, $count ) ;

              // An expression accepted by Add() (checked with a page count of 1) can still fail with the real
              // page count (a division by zero, for example) ; report it instead of silently using page 1
              if ( $low === false || $high === false )
                  error ( new PdfToTextCaptureException ( "Invalid expression " .
                          "in page range specification \"{$range [0]}..{$range [1]}\"." ) ) ;

              // Expressions using "$" may lead to values lower than 1 - adjust them
              if ( $low < 1 )
              {
                  if ( $high < 1 )
                      $high = 1 ;

                  $low = 1 ;
              }

              // Check that the range is consistent
              if ( $low > $high )
                  error ( new PdfToTextCaptureException ( "Low value ($low) must be less or equal to high value ($high) " .
                          "in page range specification \"{$range [0]}..{$range [1]}\"." ) ) ;

              // Ignore ranges where the 'low' value is higher than the number of pages in the document
              if ( $low > $count )
              {
                  warning ( new PdfToTextCaptureException ( "Low value ($low) is greater than page count ($count) " .
                          "in page range specification \"{$range [0]}..{$range [1]}\"." ) ) ;
                  continue ;
              }

              // Normalize the 'high' value, so that it's not bigger than the number of pages in the document
              if ( $high > $count )
                  $high = $count ;

              // Complement the page map using this range
              for ( $i = $low ; $i <= $high ; $i ++ )
              {
                  $this -> PageMap [$i] = true ;
                  $this -> ExtraPageMapData [$i] = $range [2] ;
              }
          }
      }


      /*--------------------------------------------------------------------------------------------------------------

          Interfaces implementations.

          The #[\ReturnTypeWillChange] attributes silence the PHP 8.1+ deprecation notices about untyped
          interface methods ; PHP versions before 8.0 read them as plain comments, which is why each one
          must stay alone on its line.

       *-------------------------------------------------------------------------------------------------------------*/

      // Countable interface : number of applicable pages
      #[\ReturnTypeWillChange]
      public function count ( )
      { return ( count ( $this -> PageMap ) ) ; }


      // Array access interface : $pages [ $n ] tells whether page $n is applicable (read-only)
      #[\ReturnTypeWillChange]
      public function offsetExists ( $offset )
      { return ( isset ( $this -> PageMap [ $offset ] ) ) ; }


      #[\ReturnTypeWillChange]
      public function offsetGet ( $offset )
      { return ( isset ( $this -> PageMap [ $offset ] ) ) ; }


      #[\ReturnTypeWillChange]
      public function offsetSet ( $offset, $value )
      { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }


      #[\ReturnTypeWillChange]
      public function offsetUnset ( $offset )
      { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }


      // Iterator interface : loops through every page number from 1 to PageCount ; the key is the page
      // number and the value is a boolean telling whether the page is applicable
      private $__iterator_value = 1 ;

      #[\ReturnTypeWillChange]
      public function rewind ( )
      { $this -> __iterator_value = 1 ; }


      #[\ReturnTypeWillChange]
      public function valid ( )
      { return ( $this -> __iterator_value >= 1 && $this -> __iterator_value <= $this -> PageCount ) ; }


      #[\ReturnTypeWillChange]
      public function key ( )
      { return ( $this -> __iterator_value ) ; }


      #[\ReturnTypeWillChange]
      public function next ( )
      { $this -> __iterator_value ++ ; }


      #[\ReturnTypeWillChange]
      public function current ( )
      { return ( isset ( $this -> PageMap [ $this -> __iterator_value ] ) ) ; }


      /*--------------------------------------------------------------------------------------------------------------

          Helper functions.

       *-------------------------------------------------------------------------------------------------------------*/

      // __parse_page_numbers -
      //  Performs a first pass on the value of the "number" attribute of the <page> tag. Numeric range bounds
      //  are converted to integers ; a bound that is not numeric is kept as a (trimmed) expression string,
      //  after checking that it can be evaluated. Each range is stored in PageRanges with its extra data.
      private function __parse_page_numbers ( $text, $extra_data )
      {
          $ranges = explode ( ',', $text ) ;

          // Loop through comma-separated ranges
          foreach ( $ranges as $range )
          {
              $items = explode ( '..', $range ) ;

              // Check if current item is a range
              switch ( count ( $items ) )
              {
                  // If not a range (ie, a single value) then make a range using that value
                  // (low and high range values will be the same)
                  case 1 :
                      if ( is_numeric ( $items [0] ) )
                          $low = $high = ( int ) $items [0] ;
                      else
                          $low = $high = trim ( $items [0] ) ;

                      break ;

                  // If range, store the low and high values
                  case 2 :
                      $low = ( is_numeric ( $items [0] ) ) ? ( int ) $items [0] : trim ( $items [0] ) ;
                      $high = ( is_numeric ( $items [1] ) ) ? ( int ) $items [1] : trim ( $items [1] ) ;
                      break ;

                  // Other cases (more than one ".." in the item) : report an error
                  default :
                      error ( new PdfToTextCaptureException ( "Invalid page range specification \"$range\"." ) ) ;
              }

              // If the low or high range value is an expression, check at this stage that it is correct
              if ( is_string ( $low ) && $this -> __check_expression ( $low ) === false )
                  error ( new PdfToTextCaptureException ( "Invalid expression \"$low\" in page range specification \"$range\"." ) ) ;

              if ( is_string ( $high ) && $this -> __check_expression ( $high ) === false )
                  error ( new PdfToTextCaptureException ( "Invalid expression \"$high\" in page range specification \"$range\"." ) ) ;

              // Add the page range and the extra data
              $this -> PageRanges [] = array ( $low, $high, $extra_data ) ;
          }
      }


      // __check_expression -
      //  Evaluates a page number expression, after replacing every "$" with $count, and returns its value.
      //  Returns false when the expression cannot be evaluated.
      private function __check_expression ( $str, $count = 1 )
      {
          $new_str = str_replace ( '$', $count, $str ) ;

          // NOTE(review): eval() executes whatever PHP code the "number" attribute contains ; this is only
          // acceptable as long as capture definitions come from a trusted source.
          // Since PHP 7, a malformed expression throws a ParseError instead of making eval() return false
          // (and PHP 8 throws on a division by zero), hence the catch blocks. On PHP 5, "Throwable" does
          // not exist and that catch block simply never matches.
          try
          {
              $value = @eval ( "return ( $new_str ) ;" ) ;
          }
          catch ( Exception $e )
          {
              $value = false ;
          }
          catch ( Throwable $e )
          {
              $value = false ;
          }

          return ( $value ) ;
      }
  }
  12271.  
  12272.  
  12273. /*==============================================================================================================
  12274.  
  12275. class PdfToTextCaptureArea -
  12276. A capture area describes a rectangle, either by its top, left, right and bottom coordinates, or by
  12277. its top/left coordinates, and its width and height.
  12278.  
  12279. ==============================================================================================================*/
  class PdfToTextCaptureArea //extends Object
  {
      // Keywords that can be used to describe the rectangle
      static private $Keys = array ( 'left', 'top', 'right', 'bottom', 'width', 'height' ) ;

      // Rectangle coordinates ; readable and writable from outside through __get() and __set()
      private $Left = false,
              $Top = false,
              $Right = false,
              $Bottom = false ;

      // Area name (for internal purposes)
      public $Name ;


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              Constructor

          PROTOTYPE
              $area = new PdfToTextCaptureArea ( $area, $default_area = null, $name = '' ) ;

          DESCRIPTION
              Builds a rectangle from the supplied coordinates.

          PARAMETERS
              $area (array) -
                  An associative array that may contain the 'left', 'top', 'right', 'bottom', 'width' and
                  'height' entries. 'left' and 'top' are mandatory. 'right' can be replaced with 'width'
                  (right = left + width - 1) and 'bottom' with 'height' (bottom = top - height + 1).
                  An entry that is missing or strictly equal to false is taken from $default_area.

              $default_area (array) -
                  Default values for the entries that $area does not provide.

              $name (string) -
                  An optional name for this area, stored as is in the Name property.

          NOTES
              . Height is subtracted from the top coordinate, so y-coordinates grow upwards.
              . Missing mandatory values are reported through the error() function, which is defined
                elsewhere in this file.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $area, $default_area = null, $name = '' )
      {
          // Collect a value for each keyword : $area first, then $default_area, otherwise false
          $settings = array ( ) ;

          foreach ( self::$Keys as $key )
          {
              if ( isset ( $area [ $key ] ) && $area [ $key ] !== false )
                  $settings [ $key ] = $area [ $key ] ;
              else if ( isset ( $default_area [ $key ] ) )
                  $settings [ $key ] = $default_area [ $key ] ;
              else
                  $settings [ $key ] = false ;
          }

          // Mandatory coordinates
          $left = $settings [ 'left' ] ;

          if ( $left !== false )
              $left = ( float ) $left ;
          else
              error ( new PdfToTextCaptureException ( 'Attribute "left" is mandatory.' ) ) ;

          $top = $settings [ 'top' ] ;

          if ( $top !== false )
              $top = ( float ) $top ;
          else
              error ( new PdfToTextCaptureException ( 'Attribute "top" is mandatory.' ) ) ;

          // Right coordinate : given explicitly, or deduced from the width
          $right = $settings [ 'right' ] ;

          if ( $right !== false )
              $right = ( float ) $right ;
          else if ( $settings [ 'width' ] !== false )
              $right = $left + ( float ) $settings [ 'width' ] - 1 ;
          else
              error ( new PdfToTextCaptureException ( 'Either the "right" or the "width" attribute must be specified.' ) ) ;

          // Bottom coordinate : given explicitly, or deduced from the height
          $bottom = $settings [ 'bottom' ] ;

          if ( $bottom !== false )
              $bottom = ( float ) $bottom ;
          else if ( $settings [ 'height' ] !== false )
              $bottom = $top - ( float ) $settings [ 'height' ] + 1 ;
          else
              error ( new PdfToTextCaptureException ( 'Either the "bottom" or the "height" attribute must be specified.' ) ) ;

          // Store the final rectangle
          $this -> Name = $name ;
          $this -> Left = $left ;
          $this -> Top = $top ;
          $this -> Right = $right ;
          $this -> Bottom = $bottom ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              __get, __set - Give access to the Left, Top, Right and Bottom coordinates, and implement the
                             computed Width and Height properties.

          NOTES
              Any other property name triggers a notice ; __get() then returns null.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __get ( $member )
      {
          if ( in_array ( $member, array ( 'Left', 'Top', 'Right', 'Bottom' ), true ) )
              return ( $this -> $member ) ;

          if ( $member === 'Width' )
              return ( $this -> Right - $this -> Left + 1 ) ;

          if ( $member === 'Height' )
              return ( $this -> Top - $this -> Bottom + 1 ) ;

          trigger_error ( "Undefined property \"$member\"." ) ;
      }


      public function __set ( $member, $value )
      {
          // Every coordinate is stored as a floating-point value
          $value = ( float ) $value ;

          if ( in_array ( $member, array ( 'Top', 'Left', 'Right', 'Bottom' ), true ) )
              $this -> $member = $value ;
          // Setting the width moves the right side
          else if ( $member === 'Width' )
              $this -> Right = $this -> Left + $value - 1 ;
          // Setting the height moves the bottom side
          else if ( $member === 'Height' )
              $this -> Bottom = $this -> Top - $value + 1 ;
          else
              trigger_error ( "Undefined property \"$member\"." ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          NAME
              Contains - Check if this area contains the specified rectangle.

          DESCRIPTION
              Returns true when the supplied rectangle lies within this area (sides included), false
              otherwise.

       *-------------------------------------------------------------------------------------------------------------*/
      public function Contains ( $left, $top, $right, $bottom )
      {
          $fits_horizontally = ( $left >= $this -> Left && $right <= $this -> Right ) ;
          $fits_vertically = ( $top <= $this -> Top && $bottom >= $this -> Bottom ) ;

          return ( $fits_horizontally && $fits_vertically ) ;
      }
  }
  12482.  
  12483.  
  12484.  
  12485. /**************************************************************************************************************
  12486. **************************************************************************************************************
  12487. **************************************************************************************************************
  12488. ****** ******
  12489. ****** ******
  12490. ****** CAPTURED TEXT MANAGEMENT ******
  12491. ****** (none of the classes listed here are meant to be instantiated outside this file) ******
  12492. ****** ******
  12493. ****** ******
  12494. **************************************************************************************************************
  12495. **************************************************************************************************************
  12496. **************************************************************************************************************/
  12497.  
  12498. /*==============================================================================================================
  12499.  
  12500. class PdfToTextCapturedText -
  12501. Base class for captured text enclosed by shapes.
  12502.  
  12503. ==============================================================================================================*/
  abstract class PdfToTextCapturedText //extends Object
  {
      // Shape name (as specified by the "name" attribute of the <rectangle> or <lines> tags, for example)
      public $Name ;
      // Number of the page where the text was found (starts from 1)
      public $Page ;
      // Shape type, copied from the Type property of the shape definition
      // (presumably one of the PdfToTextCaptureShapeDefinition::SHAPE_* constants)
      public $Type ;
      // Shape definition object that produced this capture
      private $ShapeDefinition ;
      // Captured text
      public $Text ;
      // Rectangle surrounding the captured text
      public $Left ;
      public $Top ;
      public $Right ;
      public $Bottom ;


      /*--------------------------------------------------------------------------------------------------------------

          Constructor -
              Stores the supplied values as is : page number, shape name, captured text, surrounding
              rectangle and shape definition. The Type property is read from the definition.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition )
      {
          // Where the text comes from
          $this -> ShapeDefinition = $definition ;
          $this -> Type = $definition -> Type ;
          $this -> Page = $page ;
          $this -> Name = $name ;

          // What was captured
          $this -> Text = $text ;

          // Rectangle surrounding the text
          $this -> Left = $left ;
          $this -> Right = $right ;
          $this -> Top = $top ;
          $this -> Bottom = $bottom ;
      }
  }
  12543.  
  12544.  
  12545. /*==============================================================================================================
  12546.  
  12547. class PdfToTextCapturedRectangle -
  12548. Implements a text captured by a rectangle shape.
  12549.  
  12550. ==============================================================================================================*/
  class PdfToTextCapturedRectangle extends PdfToTextCapturedText
  {
      // Constructor -
      //  Passes every parameter unchanged to the PdfToTextCapturedText constructor.
      public function __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition )
      {
          parent::__construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) ;
      }


      // __toString -
      //  Returns the captured text. The cast is needed because Text is a public, untyped property :
      //  returning a non-string value (null, for example) from __toString() is an error in PHP.
      public function __toString ( )
      { return ( ( string ) $this -> Text ) ; }
  }
  12562.  
  12563.  
  12564. /*==============================================================================================================
  12565.  
  12566. class PdfToTextCapturedColumn -
  12567. Implements a text captured by a lines/column shape.
  12568. Actually behaves like the PdfToTextCapturedRectangle class
  12569.  
  12570. ==============================================================================================================*/
  class PdfToTextCapturedColumn extends PdfToTextCapturedText
  {
      // Constructor -
      //  Passes every parameter unchanged to the PdfToTextCapturedText constructor.
      public function __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition )
      {
          parent::__construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) ;
      }


      // __toString -
      //  Returns the captured text. The cast is needed because Text is a public, untyped property :
      //  returning a non-string value (null, for example) from __toString() is an error in PHP.
      public function __toString ( )
      { return ( ( string ) $this -> Text ) ; }
  }
  12582.  
  12583.  
  12584. /*==============================================================================================================
  12585.  
  12586. class PdfToTextCapturedLine -
  12587. Implements a text captured by a lines shape.
  12588.  
  12589. ==============================================================================================================*/
  class PdfToTextCapturedLine extends PdfToTextCapturedText
          implements ArrayAccess, Countable, IteratorAggregate
  {
      // Column objects, stored exactly as supplied to the constructor
      public $Columns ;
      // Maps each column name to the key of that column in $Columns
      private $ColumnsByNames = array ( ) ;
      // Maps each column position (0, 1, ...) to the key of that column in $Columns
      private $ColumnKeys = array ( ) ;


      /*--------------------------------------------------------------------------------------------------------------

          Constructor -
              Builds a Line object based on the supplied columns.
              Also builds the Text property, which contains the text of every column joined with the
              Separator property of the definition.

          NOTES
              $columns may be a plain list or an array indexed by column name ; the lookup tables record
              the real key of each column, so that access by position and access by name work in both
              cases.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $page, $name, $columns, $left, $top, $right, $bottom, $definition )
      {
          // Although the Columns property is most likely to be used, build a text representation of the whole line
          $text = array ( ) ;

          foreach ( $columns as $key => $column )
          {
              $text [] = $column -> Text ;
              $this -> ColumnsByNames [ $column -> Name ] = $key ;
              $this -> ColumnKeys [] = $key ;
          }

          // Provide this information to the parent constructor
          parent::__construct ( $page, $name, implode ( $definition -> Separator, $text ), $left, $top, $right, $bottom, $definition ) ;

          // Store the column objects
          $this -> Columns = $columns ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          __get -
              Gives access to a column by its name ($line -> column_name). An unknown name triggers a
              notice, and null is returned.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __get ( $member )
      {
          if ( isset ( $this -> ColumnsByNames [ $member ] ) )
              return ( $this -> Columns [ $this -> ColumnsByNames [ $member ] ] ) ;
          else
              trigger_error ( "Undefined property \"$member\"." ) ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          Interfaces implementations.

          The #[\ReturnTypeWillChange] attributes silence the PHP 8.1+ deprecation notices about untyped
          interface methods ; PHP versions before 8.0 read them as plain comments, which is why each one
          must stay alone on its line.

       *-------------------------------------------------------------------------------------------------------------*/

      // Countable interface : number of columns in the line
      #[\ReturnTypeWillChange]
      public function count ( )
      { return ( count ( $this -> Columns ) ) ; }


      // IteratorAggregate interface : iterates over the Columns array
      #[\ReturnTypeWillChange]
      public function getIterator ( )
      { return ( new ArrayIterator ( $this -> Columns ) ) ; }


      // ArrayAccess interface : a numeric offset is a column position, any other offset is a column name
      #[\ReturnTypeWillChange]
      public function offsetExists ( $offset )
      {
          if ( is_numeric ( $offset ) )
              return ( $offset >= 0 && $offset < count ( $this -> Columns ) ) ;
          else
              return ( isset ( $this -> ColumnsByNames [ $offset ] ) ) ;
      }


      #[\ReturnTypeWillChange]
      public function offsetGet ( $offset )
      {
          // Select the lookup table that translates the offset to the real key in $Columns
          $keys = ( is_numeric ( $offset ) ) ? $this -> ColumnKeys : $this -> ColumnsByNames ;

          if ( ! isset ( $keys [ $offset ] ) )
          {
              trigger_error ( "Undefined column \"$offset\"." ) ;
              return ( null ) ;
          }

          return ( $this -> Columns [ $keys [ $offset ] ] ) ;
      }


      #[\ReturnTypeWillChange]
      public function offsetSet ( $offset, $value )
      { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }


      #[\ReturnTypeWillChange]
      public function offsetUnset ( $offset )
      { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }
  }
  12680.  
  12681.  
  12682. /*==============================================================================================================
  12683.  
  12684. class PdfToTextCapturedLines -
  12685. Implements a set of lines.
  12686.  
  12687. ==============================================================================================================*/
  class PdfToTextCapturedLines //extends Object
          implements ArrayAccess, Countable, IteratorAggregate
  {
      // Capture name, as specified by the "name" attribute of the <lines> tag
      public $Name ;
      // Page number of the capture
      public $Page ;
      // Captured lines
      public $Lines ;
      // Content type (mimics a little bit the PdfToTextCapturedText class)
      public $Type = PdfToTextCaptureShapeDefinition::SHAPE_LINE ;


      /*--------------------------------------------------------------------------------------------------------------

          Constructor -
              Instantiates a PdfToTextCapturedLines object ; the capture name, page number and array of
              lines are stored as is.

       *-------------------------------------------------------------------------------------------------------------*/
      public function __construct ( $name, $page, $lines )
      {
          $this -> Name = $name ;
          $this -> Page = $page ;
          $this -> Lines = $lines ;
      }


      /*--------------------------------------------------------------------------------------------------------------

          Interfaces implementations.

          The #[\ReturnTypeWillChange] attributes silence the PHP 8.1+ deprecation notices about untyped
          interface methods ; PHP versions before 8.0 read them as plain comments, which is why each one
          must stay alone on its line.

       *-------------------------------------------------------------------------------------------------------------*/

      // Countable interface : number of captured lines
      #[\ReturnTypeWillChange]
      public function count ( )
      { return ( count ( $this -> Lines ) ) ; }


      // IteratorAggregate interface : iterates over the Lines array
      #[\ReturnTypeWillChange]
      public function getIterator ( )
      { return ( new ArrayIterator ( $this -> Lines ) ) ; }


      // ArrayAccess interface : read-only access to a line by its position
      #[\ReturnTypeWillChange]
      public function offsetExists ( $offset )
      { return ( is_numeric ( $offset ) && $offset >= 0 && $offset < count ( $this -> Lines ) ) ; }


      #[\ReturnTypeWillChange]
      public function offsetGet ( $offset )
      { return ( $this -> Lines [ $offset ] ) ; }


      #[\ReturnTypeWillChange]
      public function offsetSet ( $offset, $value )
      { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }


      #[\ReturnTypeWillChange]
      public function offsetUnset ( $offset )
      { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }
  }
  12743.  
  12744.  
  12745. /**************************************************************************************************************
  12746. **************************************************************************************************************
  12747. **************************************************************************************************************
  12748. ****** ******
  12749. ****** ******
  12750. ****** CAPTURE INTERFACE FOR THE DEVELOPER ******
  12751. ****** (none of the classes listed here are meant to be instantiated outside this file) ******
  12752. ****** ******
  12753. ****** ******
  12754. **************************************************************************************************************
  12755. **************************************************************************************************************
  12756. **************************************************************************************************************/
  12757.  
  12758. /*==============================================================================================================
  12759.  
  12760. class PdfToTextCaptures -
  12761. Represents all the areas in a PDF file captured by the supplied XML definitions.
  12762.  
  12763. ==============================================================================================================*/
  12764. class PdfToTextCaptures //extends Object
  12765. {
  12766. // Captured objects - May not exactly reflect the PdfToTextCapture*Shape classes
  12767. private $CapturedObjects ;
  12768. // Allows faster access by capture name
  12769. private $ObjectsByName = array ( ) ;
  12770.  
  12771.  
  12772. /*--------------------------------------------------------------------------------------------------------------
  12773.  
  12774. Constructor -
  12775. Instantiates a PdfToTextCaptures object.
  12776.  
  12777. *-------------------------------------------------------------------------------------------------------------*/
  12778. public function __construct ( $captures )
  12779. {
  12780. $this -> CapturedObjects = $captures ;
  12781.  
  12782. // Build an array of objects indexed by their names
  12783. foreach ( $captures as $page => $shapes )
  12784. {
  12785. foreach ( $shapes as $shape )
  12786. $this -> ObjectsByName [ $shape -> Name ] [] = $shape ;
  12787. }
  12788. }
  12789.  
  12790.  
  12791. /*--------------------------------------------------------------------------------------------------------------
  12792.  
  12793. ToCaptures -
  12794. Returns a simplified view of captured objects, with only name/value pairs.
  12795.  
  12796. *-------------------------------------------------------------------------------------------------------------*/
  12797. public function ToCaptures ( )
  12798. {
  12799. $result = new stdClass ( ) ;
  12800.  
  12801. foreach ( $this -> CapturedObjects as $page => $captures )
  12802. {
  12803. foreach ( $captures as $capture )
  12804. {
  12805. switch ( $capture -> Type )
  12806. {
  12807. case PdfToTextCaptureShapeDefinition::SHAPE_RECTANGLE :
  12808. $name = $capture -> Name ;
  12809. $value = $capture -> Text ;
  12810. $result -> {$name} [ $page ] = $value ;
  12811. break ;
  12812.  
  12813. case PdfToTextCaptureShapeDefinition::SHAPE_LINE :
  12814. $name = $capture -> Name ;
  12815.  
  12816. if ( ! isset ( $result -> {$name} ) )
  12817. $result -> {$name} = array ( ) ;
  12818.  
  12819. foreach ( $capture as $line )
  12820. {
  12821. $columns = new stdClass ;
  12822.  
  12823. foreach ( $line as $column )
  12824. {
  12825. $column_name = $column -> Name ;
  12826. $column_value = $column -> Text ;
  12827. $columns -> {$column_name} = $column_value ;
  12828. }
  12829.  
  12830. $result -> {$name} [] = $columns ;
  12831. }
  12832. }
  12833. }
  12834. }
  12835.  
  12836. return ( $result ) ;
  12837. }
  12838.  
  12839.  
  12840. /*--------------------------------------------------------------------------------------------------------------
  12841.  
  12842. __get -
  12843. Retrieves the captured objects by their name, as specified in the XML definition.
  12844.  
  12845. *-------------------------------------------------------------------------------------------------------------*/
  12846. public function __get ( $member )
  12847. {
  12848. $fieldname = "__capture_{$member}__" ;
  12849.  
  12850. if ( ! isset ( $this -> $fieldname ) )
  12851. {
  12852. if ( ! isset ( $this -> ObjectsByName [ $member ] ) )
  12853. error ( new PdfToTextException ( "Undefined property \"$member\"." ) ) ;
  12854.  
  12855. $this -> $fieldname = $this -> GetCaptureInstance ( $member ) ;
  12856. }
  12857.  
  12858. return ( $this -> $fieldname ) ;
  12859. }
  12860.  
  12861.  
  12862. /*--------------------------------------------------------------------------------------------------------------
  12863.  
  12864. GetCapturedObjectsByName -
  12865. Returns an associative array of the captured shapes, indexed by their name.
  12866.  
  12867. *-------------------------------------------------------------------------------------------------------------*/
  12868. public function GetCapturedObjectsByName ( )
  12869. { return ( $this -> ObjectsByName ) ; }
  12870.  
  12871.  
  12872. /*--------------------------------------------------------------------------------------------------------------
  12873.  
  12874. GetCaptureInstance -
  12875. Returns an object inheriting from the PdfToTextCapture class, that wraps the capture results.
  12876.  
  12877. *-------------------------------------------------------------------------------------------------------------*/
  12878. protected function GetCaptureInstance ( $fieldname )
  12879. {
  12880. switch ( $this -> ObjectsByName [ $fieldname ] [0] -> Type )
  12881. {
  12882. case PdfToTextCaptureShapeDefinition::SHAPE_RECTANGLE :
  12883. return ( new PdfToTextRectangleCapture ( $this -> ObjectsByName [ $fieldname ] ) ) ;
  12884.  
  12885. case PdfToTextCaptureShapeDefinition::SHAPE_LINE :
  12886. return ( new PdfToTextLinesCapture ( $this -> ObjectsByName [ $fieldname ] ) ) ;
  12887.  
  12888. default :
  12889. error ( new PdfToTextCaptureException ( "Unhandled shape type " . $this -> ObjectsByName [ $fieldname ] [0] -> Type . "." ) ) ;
  12890. }
  12891. }
  12892.  
  12893.  
  12894. }
  12895.  
  12896.  
  12897. /*==============================================================================================================
  12898.  
  12899. class PdfToTextCapture -
  12900. Base class for all capture classes accessible to the caller.
  12901.  
  12902. ==============================================================================================================*/
  12903. class PdfToTextCapture //extends Object
  12904. implements ArrayAccess, Countable, IteratorAggregate
  12905. {
  12906. protected $Captures ;
  12907.  
  12908.  
  12909. /*--------------------------------------------------------------------------------------------------------------
  12910.  
  12911. Constructor -
  12912. Instantiates a PdfToTextCapture object.
  12913.  
  12914. *-------------------------------------------------------------------------------------------------------------*/
  12915. public function __construct ( $objects )
  12916. {
  12917. //parent::__construct ( ) ;
  12918.  
  12919. $this -> Captures = $objects ;
  12920. }
  12921.  
  12922.  
  12923. /*--------------------------------------------------------------------------------------------------------------
  12924.  
  12925. Interfaces implementations.
  12926.  
  12927. *-------------------------------------------------------------------------------------------------------------*/
  12928. public function count ( )
  12929. { return ( $this -> Captures ) ; }
  12930.  
  12931.  
  12932. public function getIterator ( )
  12933. { return ( new ArrayIterator ( $this -> Captures ) ) ; }
  12934.  
  12935.  
  12936. public function offsetExists ( $offset )
  12937. { return ( $offset >= 0 && $offset < count ( $this -> Captures ) ) ; }
  12938.  
  12939.  
  12940. public function offsetGet ( $offset )
  12941. { return ( $this -> Captures [ $offset ] ) ; }
  12942.  
  12943.  
  12944. public function offsetSet ( $offset, $value )
  12945. { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }
  12946.  
  12947.  
  12948. public function offsetUnset ( $offset )
  12949. { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }
  12950.  
  12951. }
  12952.  
  12953.  
  12954. /*==============================================================================================================
  12955.  
  12956. class PdfToTextLinesCapture -
  12957. Represents a capture of lines, flattened so that they are no longer indexed by page number.
  12958.  
  12959. ==============================================================================================================*/
  12960. class PdfToTextLinesCapture extends PdfToTextCapture
  12961. {
  12962. /*--------------------------------------------------------------------------------------------------------------
  12963.  
  12964. Constructor -
  12965. "flattens" the supplied object list, by removing the PdfToTextCapturedLines class level, so that lines
  12966. can be iterated whatever their page number is.
  12967.  
  12968. *-------------------------------------------------------------------------------------------------------------*/
  12969. public function __construct ( $objects )
  12970. {
  12971. $new_objects = array ( ) ;
  12972.  
  12973. foreach ( $objects as $object )
  12974. {
  12975. foreach ( $object as $line )
  12976. $new_objects [] = $line ;
  12977. }
  12978.  
  12979. parent::__construct ( $new_objects ) ;
  12980. }
  12981. }
  12982.  
  12983.  
  12984. /*==============================================================================================================
  12985.  
  12986. class PdfToTextRectangleCapture -
  12987. Implements a rectangle capture, from the caller point of view.
  12988.  
  12989. ==============================================================================================================*/
  12990. class PdfToTextRectangleCapture extends PdfToTextCapture
  12991. {
  12992. /*--------------------------------------------------------------------------------------------------------------
  12993.  
  12994. Constructor -
  12995. Builds an object array indexed by page number.
  12996.  
  12997. *-------------------------------------------------------------------------------------------------------------*/
  12998. public function __construct ( $objects )
  12999. {
  13000. $new_objects = array ( ) ;
  13001.  
  13002. foreach ( $objects as $object )
  13003. $new_objects [ $object -> Page ] = $object ;
  13004.  
  13005. parent::__construct ( $new_objects ) ;
  13006. }
  13007. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement