Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
- /*
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- This code is an improved version of what can be found at:
- http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php
- Related information:
- http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
- AUTHOR:
- - Webcheatsheet.com (Original code)
- - Joeri Stegeman (joeri210 [at] yahoo [dot] com) (Class conversion and fixes/adjustments)
- DESCRIPTION:
- This is a class to convert PDF files into ASCII text or so called PDF text extraction.
- It will ignore anything that is not addressed as text within the PDF and any layout.
- Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode
- HISTORY:
- v1.01 2014-02-03: Fixed some word concatenation issues
- v1.00 2010-04-17: Initial release of PDF2Text
- PURPOSE(S):
- Most likely for people that want their PDF to be searchable.
- SYNTAX:
- include('class.pdf2text.php');
- $a = new PDF2Text();
- $a->setFilename('test.pdf');
- $a->decodePDF();
- echo $a->output();
- ALTERNATIVES:
- Other excellent options to search within a PDF:
- - Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution
- - pdflib TET (http://www.pdflib.com/products/tet/)
- - Online converter: http://snowtide.com/PDFTextStream
- */
- class PDF2Text {
- // Some settings
- var $multibyte = 4; // Use setUnicode(TRUE|FALSE)
- var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
- var $showprogress = true; // TRUE if you have problems with time-out
- // Variables
- var $filename = '';
- var $decodedtext = '';
- function setFilename($filename) {
- // Reset
- $this->decodedtext = '';
- $this->filename = $filename;
- }
- function output($echo = false) {
- if($echo) echo $this->decodedtext;
- else return $this->decodedtext;
- }
- function setUnicode($input) {
- // 4 for unicode. But 2 should work in most cases just fine
- if($input == true) $this->multibyte = 4;
- else $this->multibyte = 2;
- }
- function decodePDF() {
- // Read the data from pdf file
- $infile = @file_get_contents($this->filename, FILE_BINARY);
- if (empty($infile))
- return "";
- // Get all text data.
- $transformations = array();
- $texts = array();
- // Get the list of all objects.
- preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile . "endobj\r", $objects);
- $objects = @$objects[1];
- // Select objects with streams.
- for ($i = 0; $i < count($objects); $i++) {
- $currentObject = $objects[$i];
- // Prevent time-out
- @set_time_limit ();
- if($this->showprogress) {
- // echo ". ";
- flush(); ob_flush();
- }
- // Check if an object includes data stream.
- if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject . "endstream\r", $stream )) {
- $stream = ltrim($stream[1]);
- // Check object parameters and look for text data.
- $options = $this->getObjectOptions($currentObject);
- if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])) )
- // if ( $options["Image"] && $options["Subtype"] )
- // if (!(empty($options["Length1"]) && empty($options["Subtype"])) )
- continue;
- // Hack, length doesnt always seem to be correct
- unset($options["Length"]);
- // So, we have text data. Decode it.
- $data = $this->getDecodedStream($stream, $options);
- if (strlen($data)) {
- if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data . "ET\r", $textContainers)) {
- $textContainers = @$textContainers[1];
- $this->getDirtyTexts($texts, $textContainers);
- } else
- $this->getCharTransformations($transformations, $data);
- }
- }
- }
- // Analyze text blocks taking into account character transformations and return results.
- $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations);
- }
- function decodeAsciiHex($input) {
- $output = "";
- $isOdd = true;
- $isComment = false;
- for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) {
- $c = $input[$i];
- if($isComment) {
- if ($c == '\r' || $c == '\n')
- $isComment = false;
- continue;
- }
- switch($c) {
- case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
- case '%':
- $isComment = true;
- break;
- default:
- $code = hexdec($c);
- if($code === 0 && $c != '0')
- return "";
- if($isOdd)
- $codeHigh = $code;
- else
- $output .= chr($codeHigh * 16 + $code);
- $isOdd = !$isOdd;
- break;
- }
- }
- if($input[$i] != '>')
- return "";
- if($isOdd)
- $output .= chr($codeHigh * 16);
- return $output;
- }
- function decodeAscii85($input) {
- $output = "";
- $isComment = false;
- $ords = array();
- for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
- $c = $input[$i];
- if($isComment) {
- if ($c == '\r' || $c == '\n')
- $isComment = false;
- continue;
- }
- if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
- continue;
- if ($c == '%') {
- $isComment = true;
- continue;
- }
- if ($c == 'z' && $state === 0) {
- $output .= str_repeat(chr(0), 4);
- continue;
- }
- if ($c < '!' || $c > 'u')
- return "";
- $code = ord($input[$i]) & 0xff;
- $ords[$state++] = $code - ord('!');
- if ($state == 5) {
- $state = 0;
- for ($sum = 0, $j = 0; $j < 5; $j++)
- $sum = $sum * 85 + $ords[$j];
- for ($j = 3; $j >= 0; $j--)
- $output .= chr($sum >> ($j * 8));
- }
- }
- if ($state === 1)
- return "";
- elseif ($state > 1) {
- for ($i = 0, $sum = 0; $i < $state; $i++)
- $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
- for ($i = 0; $i < $state - 1; $i++) {
- try {
- if(false == ($o = chr($sum >> ((3 - $i) * 8)))) {
- throw new Exception('Error');
- }
- $output .= $o;
- } catch (Exception $e) { /*Dont do anything*/ }
- }
- }
- return $output;
- }
- function decodeFlate($data) {
- return @gzuncompress($data);
- }
- function getObjectOptions($object) {
- $options = array();
- if (preg_match("#<<(.*)>>#ismU", $object, $options)) {
- $options = explode("/", $options[1]);
- @array_shift($options);
- $o = array();
- for ($j = 0; $j < @count($options); $j++) {
- $options[$j] = preg_replace("#\s+#", " ", trim($options[$j]));
- if (strpos($options[$j], " ") !== false) {
- $parts = explode(" ", $options[$j]);
- $o[$parts[0]] = $parts[1];
- } else
- $o[$options[$j]] = true;
- }
- $options = $o;
- unset($o);
- }
- return $options;
- }
- function getDecodedStream($stream, $options) {
- $data = "";
- if (empty($options["Filter"]))
- $data = $stream;
- else {
- $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
- $_stream = substr($stream, 0, $length);
- foreach ($options as $key => $value) {
- if ($key == "ASCIIHexDecode")
- $_stream = $this->decodeAsciiHex($_stream);
- elseif ($key == "ASCII85Decode")
- $_stream = $this->decodeAscii85($_stream);
- elseif ($key == "FlateDecode")
- $_stream = $this->decodeFlate($_stream);
- elseif ($key == "Crypt") { // TO DO
- }
- }
- $data = $_stream;
- }
- return $data;
- }
- function getDirtyTexts(&$texts, $textContainers) {
- for ($j = 0; $j < count($textContainers); $j++) {
- if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts))
- $texts = array_merge($texts, array(@implode('', $parts[1])));
- elseif (preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
- $texts = array_merge($texts, array(@implode('', $parts[1])));
- elseif (preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
- $texts = array_merge($texts, array(@implode('', $parts[1])));
- }
- }
- function getCharTransformations(&$transformations, $stream) {
- preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
- preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
- for ($j = 0; $j < count($chars); $j++) {
- $count = $chars[$j][1];
- $current = explode("\n", trim($chars[$j][2]));
- for ($k = 0; $k < $count && $k < count($current); $k++) {
- if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
- $transformations[str_pad($map[1], 4, "0")] = $map[2];
- }
- }
- for ($j = 0; $j < count($ranges); $j++) {
- $count = $ranges[$j][1];
- $current = explode("\n", trim($ranges[$j][2]));
- for ($k = 0; $k < $count && $k < count($current); $k++) {
- if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
- $from = hexdec($map[1]);
- $to = hexdec($map[2]);
- $_from = hexdec($map[3]);
- for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
- $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
- } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
- $from = hexdec($map[1]);
- $to = hexdec($map[2]);
- $parts = preg_split("#\s+#", trim($map[3]));
- for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
- $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
- }
- }
- }
- }
- function getTextUsingTransformations($texts, $transformations) {
- $document = "";
- for ($i = 0; $i < count($texts); $i++) {
- $isHex = false;
- $isPlain = false;
- $hex = "";
- $plain = "";
- for ($j = 0; $j < strlen($texts[$i]); $j++) {
- $c = $texts[$i][$j];
- switch($c) {
- case "<":
- $hex = "";
- $isHex = true;
- $isPlain = false;
- break;
- case ">":
- $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
- for ($k = 0; $k < count($hexs); $k++) {
- $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero
- if (isset($transformations[$chex]))
- $chex = $transformations[$chex];
- $document .= html_entity_decode("&#x".$chex.";");
- }
- $isHex = false;
- break;
- case "(":
- $plain = "";
- $isPlain = true;
- $isHex = false;
- break;
- case ")":
- $document .= $plain;
- $isPlain = false;
- break;
- case "\\":
- $c2 = $texts[$i][$j + 1];
- if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
- elseif ($c2 == "n") $plain .= '\n';
- elseif ($c2 == "r") $plain .= '\r';
- elseif ($c2 == "t") $plain .= '\t';
- elseif ($c2 == "b") $plain .= '\b';
- elseif ($c2 == "f") $plain .= '\f';
- elseif ($c2 >= '0' && $c2 <= '9') {
- $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
- $j += strlen($oct) - 1;
- $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes);
- }
- $j++;
- break;
- default:
- if ($isHex)
- $hex .= $c;
- elseif ($isPlain)
- $plain .= $c;
- break;
- }
- }
- $document .= "\n";
- }
- return $document;
- }
- }
- ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement