Advertisement
planzelle

PHP script to convert PDF to CSV (pdftotext)

Feb 24th, 2016
450
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 1.86 KB | None | 0 0
  1. <?php
  2. /**
  3.  * convert pdf file to csv.
  4.  *
  5.  * Shell command to convert pdf to text file:
  6.  * pdftotext -layout file.pdf
  7.  *
  8.  * The resulting file will be parsed and converted to a csv file
  9.  *
  10.  * Example rows of interest (footer and header lines will be ignored):
  11.  * 98458995   21.02.2016            10.06.2016/DOE/JOHN                                      160,96
  12.  * The line has a trailing linefeed !
  13.  *
  14.  */
  /**
   * Emit the opening part of the HTML page.
   *
   * Writes the doctype, the <head> section, the opening <body> tag, a heading
   * and the multipart upload form (file field named "pdf") to standard output.
   * The <body> and <html> elements are left open; printFooter() closes them.
   */
  function printHeader() {
      // One entry per output line. The lines are joined with "\n" and nothing
      // is appended after the last entry, which is a line holding one space.
      $markupLines = array(
          '<!doctype html>',
          '<html>',
          '<head>',
          '    <meta charset="utf-8">',
          '    <title>PDF2CSV-Converter</title>',
          '</head>',
          '<body>',
          '    <h1>PDF file upload</h1>',
          '    <form enctype="multipart/form-data" method="POST">',
          '        <input type="file" name="pdf" /><br />',
          '        <input type="submit" value="Upload now" />',
          '    </form>',
          ' ',
      );

      echo implode("\n", $markupLines);
  }
  32.  
  33.  
  /**
   * Emit the closing part of the HTML page.
   *
   * Writes the closing </body> and </html> tags, on two lines, to standard
   * output. No newline follows the final tag.
   */
  function printFooter() {
      $closingTags = "</body>\n</html>";
      echo $closingTags;
  }
  40.  
  41.  
  // ---------------------------------------------------------------------------
  // Main script: always render the upload form; when a PDF was uploaded, run it
  // through pdftotext and write the matching table rows to a semicolon-separated
  // CSV file that is then offered for download.
  // ---------------------------------------------------------------------------
  printHeader();

  // All working files share one fixed base name in the current directory.
  // NOTE(review): because the names are fixed, concurrent uploads would
  // overwrite each other's files - confirm this is single-user only.
  $baseName = "file";
  $pdfFile = $baseName.'.pdf';
  $inFile = $baseName.'.txt';
  $outFile = $baseName.'.csv';

  // Remove leftovers from a previous request. Checking is_file() first avoids
  // the "no such file" warning without hiding real errors such as permissions.
  foreach (array($pdfFile, $inFile, $outFile) as $staleFile) {
      if (is_file($staleFile)) {
          unlink($staleFile);
      }
  }

  if (array_key_exists('pdf', $_FILES)) {
      // tmp_name is an array when the field is submitted as "pdf[]"; only a
      // plain string is a valid single upload.
      $tmpName = isset($_FILES['pdf']['tmp_name']) ? $_FILES['pdf']['tmp_name'] : null;
      $ok = is_string($tmpName) && move_uploaded_file($tmpName, $pdfFile);
      if ($ok) {
          // convert pdf to txt; arguments are escaped so the command stays
          // correct even if the file names ever contain shell metacharacters
          shell_exec('pdftotext -layout '.escapeshellarg($pdfFile).' '.escapeshellarg($inFile));

          // use a pattern to identify tabular data of interest
          $pattern = "/^(\d{8})\s{2,}(\d{2}\.\d{2}\.\d{4})\s{2,}/"; // e.g. "98458995   21.02.2016       "

          // pdftotext may have failed (missing binary, broken PDF); only go on
          // when the text file exists and the CSV can be opened for writing.
          $content = is_file($inFile) ? file($inFile) : false;
          $fh = ($content !== false) ? fopen($outFile, "w") : false;

          if ($fh === false) {
              echo '<hr />Conversion failed.<hr />';
          } else {
              foreach ($content as $line) {
                  if (!preg_match($pattern, $line)) {
                      continue; // header, footer or other non-table line
                  }
                  // Strip the line ending (and trailing blanks) before splitting,
                  // so the row terminator does not depend on whether the input
                  // used "\n" or "\r\n" or had trailing spaces.
                  // split cols - in this case multiple spaces
                  $cols = preg_split("/\s{2,}/", rtrim($line));
                  // Exactly four fields per row: extra columns are dropped,
                  // missing ones become empty strings.
                  $cols = array_pad(array_slice($cols, 0, 4), 4, '');
                  // write line to file, always terminated by a single "\n"
                  fputs($fh, implode(';', $cols)."\n");
              }
              fclose($fh);
              echo '<hr /><a href="'.$outFile.'">Download '.$outFile.'</a><hr />';
          }
      }//if ok
  }

  printFooter();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement