Advertisement
Guest User

converter

a guest
Sep 22nd, 2014
145
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.36 KB | None | 0 0
  1. <?php
  2.  
  // Script entry point: extract the text of the sample PDF and print it.
  // pdf2text() returns an empty string when the file cannot be read.
  $result = pdf2text('2D Digital Animation NC III.pdf');
  echo $result;
  5.  
  /**
   * Decode data encoded with the PDF ASCIIHexDecode filter.
   *
   * Pairs of hexadecimal digits are turned into bytes. White-space between
   * digits is skipped, a '%' starts a comment that runs to the end of the
   * line, and '>' marks the end of the data. If the data ends with an odd
   * number of digits, the last digit is treated as if followed by '0'.
   *
   * @param string $input Encoded data, including the terminating '>'.
   * @return string Decoded bytes, or "" when a non-hex character is found
   *                or the '>' terminator is missing.
   */
  function decodeAsciiHex($input) {
      $input = (string) $input;
      $length = strlen($input);

      $output = "";
      $highNibble = -1;      // value of a pending first digit, -1 when none
      $isComment = false;
      $terminated = false;

      for ($i = 0; $i < $length; $i++) {
          $c = $input[$i];

          // The end marker wins over everything else, even inside a comment.
          if ($c === '>') {
              $terminated = true;
              break;
          }

          if ($isComment) {
              // Double quotes are required here: '\r' in single quotes is the
              // two-character string backslash + r and would never match.
              if ($c === "\r" || $c === "\n")
                  $isComment = false;
              continue;
          }

          if ($c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ')
              continue;

          if ($c === '%') {
              $isComment = true;
              continue;
          }

          if (!ctype_xdigit($c))
              return "";

          $code = hexdec($c);
          if ($highNibble < 0) {
              $highNibble = $code;
          } else {
              $output .= chr($highNibble * 16 + $code);
              $highNibble = -1;
          }
      }

      if (!$terminated)
          return "";

      // Odd number of digits: pad the dangling high nibble with a zero.
      if ($highNibble >= 0)
          $output .= chr($highNibble * 16);

      return $output;
  }
  /**
   * Decode data encoded with the PDF ASCII85Decode filter.
   *
   * Every group of five characters in the range '!'..'u' encodes four bytes;
   * 'z' at a group boundary stands for four zero bytes. White-space is
   * skipped and decoding stops at the first '~' (start of the "~>" marker)
   * or at the end of the input. A final partial group of n characters
   * (2 <= n <= 4) yields n - 1 bytes.
   *
   * Note that '%' is a regular base-85 digit (value 4) and is decoded as
   * data; it does not start a comment.
   *
   * @param string $input Encoded data.
   * @return string Decoded bytes, or "" when an invalid character or a
   *                dangling single-character group is found.
   */
  function decodeAscii85($input) {
      $input = (string) $input;
      $length = strlen($input);

      $output = "";
      $ords = array();
      $state = 0;            // number of digits collected for the current group

      for ($i = 0; $i < $length && $input[$i] !== '~'; $i++) {
          $c = $input[$i];

          // Double-quoted escapes: these must be the real control characters.
          if ($c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ')
              continue;

          if ($c === 'z' && $state === 0) {
              $output .= str_repeat(chr(0), 4);
              continue;
          }

          $code = ord($c);
          if ($code < ord('!') || $code > ord('u'))
              return "";

          $ords[$state++] = $code - ord('!');

          if ($state == 5) {
              $state = 0;
              for ($sum = 0, $j = 0; $j < 5; $j++)
                  $sum = $sum * 85 + $ords[$j];
              for ($j = 3; $j >= 0; $j--)
                  $output .= chr(($sum >> ($j * 8)) & 0xFF);
          }
      }

      if ($state === 1)
          return "";

      if ($state > 1) {
          // Complete the partial group with the highest digit ('u' = 84) so
          // that truncating the result restores the original leading bytes.
          for ($sum = 0, $j = 0; $j < 5; $j++)
              $sum = $sum * 85 + ($j < $state ? $ords[$j] : 84);
          for ($j = 0; $j < $state - 1; $j++)
              $output .= chr(($sum >> ((3 - $j) * 8)) & 0xFF);
      }

      return $output;
  }
  /**
   * Inflate zlib-compressed data (PDF FlateDecode filter).
   *
   * @param string $input Compressed bytes.
   * @return string|false The uncompressed data, or false when gzuncompress()
   *                      rejects the input (its warning is suppressed).
   */
  function decodeFlate($input) {
      $inflated = @gzuncompress($input);

      return $inflated;
  }
  103.  
  /**
   * Extract the entries of the first "<< ... >>" dictionary in a PDF object.
   *
   * This is a heuristic, not a real parser: the dictionary text is split on
   * "/" and each piece becomes one entry. A piece of the form "Name value..."
   * is stored as Name => first value token; a bare "Name" is stored as
   * Name => true. Name values therefore show up as keys of their own, e.g.
   * "/Filter /FlateDecode" yields both "Filter" and "FlateDecode".
   *
   * Array brackets at either end of a piece are dropped so that
   * "/Filter [/FlateDecode]" produces the key "FlateDecode" rather than
   * "FlateDecode]". The regular expression is non-greedy, so a nested
   * dictionary ends the match at its own ">>".
   *
   * @param string $object Raw text of one PDF object.
   * @return array Map of entry name => true|string; empty when the object
   *               contains no dictionary.
   */
  function getObjectOptions($object) {
      $options = array();

      if (!preg_match("#<<(.*)>>#ismU", $object, $match))
          return $options;

      $tokens = explode("/", $match[1]);
      array_shift($tokens);      // text before the first "/" is not an entry

      foreach ($tokens as $token) {
          // Collapse white-space runs, then strip blanks and array brackets
          // from both ends of the piece.
          $token = trim(preg_replace("#\s+#", " ", $token), " []");
          if ($token === "")
              continue;

          if (strpos($token, " ") !== false) {
              $parts = explode(" ", $token);
              $options[$parts[0]] = $parts[1];
          } else {
              $options[$token] = true;
          }
      }

      return $options;
  }
  /**
   * Run a stream through the decoders named in its dictionary.
   *
   * When $options has no "Filter" entry the stream is returned untouched.
   * Otherwise the stream is cut to $options["Length"] bytes (if that entry
   * is set and non-empty) and every option key that names a supported
   * filter (ASCIIHexDecode, ASCII85Decode, FlateDecode) triggers the
   * matching decoder, in the order the keys appear in $options.
   *
   * @param string $stream  Raw stream bytes.
   * @param array  $options Dictionary entries as built by getObjectOptions().
   * @return mixed The stream after all matching decoders have run.
   */
  function getDecodedStream($stream, $options) {
      if (empty($options["Filter"])) {
          return $stream;
      }

      if (empty($options["Length"])) {
          $limit = strlen($stream);
      } else {
          $limit = $options["Length"];
      }
      $decoded = substr($stream, 0, $limit);

      foreach (array_keys($options) as $name) {
          if ($name == "ASCIIHexDecode") {
              $decoded = decodeAsciiHex($decoded);
          }
          if ($name == "ASCII85Decode") {
              $decoded = decodeAscii85($decoded);
          }
          if ($name == "FlateDecode") {
              $decoded = decodeFlate($decoded);
          }
      }

      return $decoded;
  }
  /**
   * Collect the raw operands of text-showing operators from BT..ET sections.
   *
   * For each container the "[...] TJ" form is tried first; only when it
   * yields nothing is the "Td (...) Tj" form tried. Captured operands are
   * appended to $texts undecoded.
   *
   * @param array $texts          Accumulator, extended in place.
   * @param array $textContainers List of text-section bodies.
   * @return void
   */
  function getDirtyTexts(&$texts, $textContainers) {
      $total = count($textContainers);
      $index = 0;

      while ($index < $total) {
          $container = $textContainers[$index];
          $index++;

          if (preg_match_all("#\[(.*)\]\s*TJ#ismU", $container, $found)) {
              $texts = array_merge($texts, @$found[1]);
              continue;
          }

          if (preg_match_all("#Td\s*(\(.*\))\s*Tj#ismU", $container, $found)) {
              $texts = array_merge($texts, @$found[1]);
          }
      }
  }
  /**
   * Read character-code mappings from a CMap-like stream.
   *
   * Handles "N beginbfchar ... endbfchar" sections (one "<src> <dst>" pair
   * per line) and "N beginbfrange ... endbfrange" sections (one
   * "<from> <to> <dstStart>" or "<from> <to> [<dst> <dst> ...]" entry per
   * line). At most N lines of each section are examined. Lines may be
   * terminated by LF, CR or CRLF.
   *
   * Keys written by bfchar sections are the source code exactly as it
   * appears in the stream, right-padded with "0" to four digits; keys and
   * values written by bfrange sections are four-digit upper-case hex.
   *
   * @param array  $transformations Map of source code => destination hex,
   *                                extended in place.
   * @param string $stream          Decoded stream contents.
   * @return void
   */
  function getCharTransformations(&$transformations, $stream) {
      preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
      preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);

      if (!is_array($chars))
          $chars = array();
      if (!is_array($ranges))
          $ranges = array();

      // Any of the three end-of-line conventions may separate entries.
      $eol = "#\r\n|\r|\n#";

      foreach ($chars as $section) {
          $lines = preg_split($eol, trim($section[2]));
          $limit = min((int) $section[1], count($lines));

          for ($k = 0; $k < $limit; $k++) {
              if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($lines[$k]), $map))
                  $transformations[str_pad($map[1], 4, "0")] = $map[2];
          }
      }

      foreach ($ranges as $section) {
          $lines = preg_split($eol, trim($section[2]));
          $limit = min((int) $section[1], count($lines));

          for ($k = 0; $k < $limit; $k++) {
              $line = trim($lines[$k]);

              if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $map)) {
                  // Consecutive source codes map to consecutive destinations.
                  $from = hexdec($map[1]);
                  $to = hexdec($map[2]);
                  $_from = hexdec($map[3]);

                  for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
                      $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
              } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $map)) {
                  // Each source code gets its own destination from the list.
                  $from = hexdec($map[1]);
                  $to = hexdec($map[2]);
                  $parts = preg_split("#\s+#", trim($map[3]));

                  for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) {
                      // Drop the angle brackets so hexdec() only sees digits.
                      $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec(trim($parts[$n], "<>")));
                  }
              }
          }
      }
  }
  /**
   * Turn raw text-operator operands into readable text.
   *
   * Each element of $texts is scanned for literal strings "( ... )" and hex
   * strings "< ... >"; anything between them (such as kerning numbers) is
   * ignored. One line feed is appended after every element.
   *
   * Literal strings: backslash escapes (\n \r \t \b \f \\ \( \) and one to
   * three octal digits) are resolved, a backslash before a line break joins
   * the lines, a backslash before any other character is dropped, and
   * balanced unescaped parentheses are kept as text. An unterminated string
   * is discarded.
   *
   * Hex strings: only hex digits are collected, then cut into four-digit
   * codes (the last one right-padded with "0"). A code found in
   * $transformations is replaced by its mapped value; the result is
   * decoded four hex digits at a time as Unicode code points.
   *
   * @param array $texts           Raw operands, e.g. from getDirtyTexts().
   * @param array $transformations Map of four-digit hex code => hex value.
   * @return string Extracted text, UTF-8 encoded.
   */
  function getTextUsingTransformations($texts, $transformations) {
      // Double quotes are essential: '\n' in single quotes would be the
      // two characters backslash + n. PHP has no "\b" escape, hence "\x08".
      $escapes = array(
          "n" => "\n",
          "r" => "\r",
          "t" => "\t",
          "b" => "\x08",
          "f" => "\f",
          "\\" => "\\",
          "(" => "(",
          ")" => ")",
      );

      $document = "";
      foreach ($texts as $text) {
          $text = (string) $text;
          $length = strlen($text);

          $hex = "";
          $plain = "";
          $isHex = false;
          $depth = 0;        // parenthesis nesting; > 0 while inside a literal string

          for ($j = 0; $j < $length; $j++) {
              $c = $text[$j];

              if ($depth > 0) {
                  if ($c === "\\") {
                      if ($j + 1 >= $length)
                          break;                 // dangling backslash at the very end

                      $c2 = $text[$j + 1];
                      if (isset($escapes[$c2])) {
                          $plain .= $escapes[$c2];
                          $j++;
                      } elseif (preg_match("#^[0-7]{1,3}#", substr($text, $j + 1, 3), $oct)) {
                          $plain .= html_entity_decode("&#" . octdec($oct[0]) . ";", ENT_QUOTES, "UTF-8");
                          $j += strlen($oct[0]);
                      } elseif ($c2 === "\r" || $c2 === "\n") {
                          // Line continuation: swallow the line break (CRLF as one).
                          $j++;
                          if ($c2 === "\r" && $j + 1 < $length && $text[$j + 1] === "\n")
                              $j++;
                      }
                      // Any other escape: the backslash is dropped and the next
                      // character is handled by the following iteration.
                  } elseif ($c === "(") {
                      $depth++;
                      $plain .= $c;
                  } elseif ($c === ")") {
                      $depth--;
                      if ($depth === 0)
                          $document .= $plain;
                      else
                          $plain .= $c;
                  } else {
                      $plain .= $c;
                  }
              } elseif ($isHex) {
                  if ($c === ">") {
                      if ($hex !== "") {
                          foreach (str_split($hex, 4) as $code) {
                              $code = str_pad($code, 4, "0");
                              if (isset($transformations[$code]))
                                  $code = (string) $transformations[$code];

                              // A mapped value may hold several code units.
                              foreach (str_split($code, 4) as $unit)
                                  $document .= html_entity_decode("&#x" . $unit . ";", ENT_QUOTES, "UTF-8");
                          }
                      }
                      $isHex = false;
                  } elseif (ctype_xdigit($c)) {
                      $hex .= $c;
                  }
              } elseif ($c === "(") {
                  $plain = "";
                  $depth = 1;
              } elseif ($c === "<") {
                  $hex = "";
                  $isHex = true;
              }
          }

          $document .= "\n";
      }

      return $document;
  }
  249.  
  /**
   * Extract the text of a PDF file.
   *
   * The file is scanned for "obj ... endobj" sections. Each section that
   * contains a stream and whose dictionary has none of Length1, Type or
   * Subtype is decoded: streams containing BT..ET text sections contribute
   * their text operands, every other stream is searched for character
   * mappings. The operands are finally converted to text using the
   * collected mappings.
   *
   * @param string $filename Path of the PDF file.
   * @return string Extracted text; "" when the file is missing, unreadable
   *                or empty.
   */
  function pdf2text($filename) {
      // Read errors are suppressed on purpose: an unreadable file simply
      // produces no text.
      $infile = @file_get_contents($filename);
      if (empty($infile))
          return "";

      $transformations = array();
      $texts = array();

      // preg_match_all() returns false on a PCRE error (e.g. backtrack limit
      // hit on a large file); treat that the same as "no objects found".
      $found = preg_match_all("#obj(.*)endobj#ismU", $infile, $matches);
      $objects = ($found && isset($matches[1]) && is_array($matches[1])) ? $matches[1] : array();

      foreach ($objects as $currentObject) {
          if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
              continue;

          $stream = ltrim($stream[1]);

          // Skip objects carrying Length1, Type or Subtype entries.
          $options = getObjectOptions($currentObject);
          if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
              continue;

          $data = getDecodedStream($stream, $options);
          if (!is_string($data) || $data === "")
              continue;

          if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers))
              getDirtyTexts($texts, $textContainers[1]);
          else
              getCharTransformations($transformations, $data);
      }

      return getTextUsingTransformations($texts, $transformations);
  }
  284. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement