Guest User

Untitled

a guest
May 22nd, 2018
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.15 KB | None | 0 0
  require 'shellwords'

  # Depends on working pdftk, gm (GraphicsMagick), and pdftotext (Poppler) commands.
  # Splits a pdf into batches of N pages, creates their thumbnails and icons,
  # as specified in the Job options, gets the text for every page, and merges
  # it all back into a tar archive for convenient download.
  #
  # Options read by this action:
  # * <tt>'batch_size'</tt> -- number of single-page pdfs per batch (integer >= 1).
  # * <tt>'images'</tt> -- list of hashes with <tt>'name'</tt>, <tt>'options'</tt>
  #   (raw arguments handed to <tt>gm convert</tt>) and <tt>'extension'</tt> keys.
  #   May be omitted, in which case only text is extracted.
  #
  # NOTE(review): +input+, +input_path+, +file_name+, +options+, +save+ and
  # +download+ are not defined here; presumably they are provided by
  # CloudCrowd::Action -- confirm against the CloudCrowd version in use.
  #
  # See <tt>examples/process_pdfs_example.rb</tt> for more information.
  class ProcessPdfs < CloudCrowd::Action

    # Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
    # chunks for processing. The double pdftk shuffle fixes the document xrefs.
    #
    # Returns the JSON-encoded list of whatever +save+ returns for each batch
    # archive. Raises ArgumentError when 'batch_size' is not a positive integer
    # (TypeError when it is missing).
    def split
      # Validate up front so a bad option fails before any work is done,
      # instead of surfacing later as a FloatDomainError or TypeError.
      batch_size = Integer(options['batch_size'])
      raise ArgumentError, "batch_size must be at least 1, got #{batch_size}" if batch_size < 1

      burst_pattern = "#{file_name}_%05d.pdf_temp"
      `pdftk #{Shellwords.escape(input_path)} burst output #{Shellwords.escape(burst_pattern)}`
      FileUtils.rm input_path

      # Second pdftk pass: rewrite every burst page into its final .pdf file.
      Dir['*.pdf_temp'].sort.each do |temp_pdf|
        page_pdf = "#{File.basename(temp_pdf, '.pdf_temp')}.pdf"
        `pdftk #{Shellwords.escape(temp_pdf)} output #{Shellwords.escape(page_pdf)}`
      end

      # Sort explicitly: Dir[] gives no ordering guarantee on older Rubies, and
      # batches should hold consecutive pages.
      Dir['*.pdf'].sort.each_slice(batch_size).with_index do |batch_pdfs, batch_num|
        tar_path = format('%05d.tar', batch_num)
        `tar -czf #{tar_path} #{Shellwords.join(batch_pdfs)}`
      end

      Dir['*.tar'].sort.map { |tar| save(tar) }.to_json
    end

    # Convert a pdf page into different-sized thumbnails. Grab the text.
    #
    # Unpacks the batch archive, runs all image and text commands as one
    # '&&'-chained shell command (so the first failing command stops the rest),
    # removes the page pdfs, and saves everything left over as a new archive.
    def process
      `tar -xzf #{Shellwords.escape(input_path)}`
      FileUtils.rm input_path

      cmds = []
      generate_images_commands(cmds)
      generate_text_commands(cmds)
      # Nothing to run when the batch contained no pdfs.
      system cmds.join(' && ') unless cmds.empty?

      FileUtils.rm Dir['*.pdf']
      archive = "#{file_name}.tar"
      `tar -czf #{Shellwords.escape(archive)} *`
      save(archive)
    end

    # Merge all of the resulting images, all of the resulting text files, and
    # the concatenated merge of the full-text into a single tar archive, ready
    # for download.
    #
    # Resulting layout, per document name:
    #   <name>/text/pages/<name>_<page>.txt
    #   <name>/text/full/<name>.txt
    #   <name>/images/<image name>/<name>_<page>.<extension>
    def merge
      input.each do |batch_url|
        batch_path = File.basename(batch_url)
        download(batch_url, batch_path)
        `tar -xzf #{Shellwords.escape(batch_path)}`
        FileUtils.rm batch_path
      end

      names = Dir['*.txt'].sort.map { |fn| fn.sub(/_\d+(_\w+)?\.txt\Z/, '') }.uniq
      dirs = names.flat_map do |n|
        ["#{n}/text/full", "#{n}/text/pages"] + image_specs.map { |i| "#{n}/images/#{i['name']}" }
      end
      FileUtils.mkdir_p(dirs)

      Dir['*.*'].sort.each do |file|
        # The directories created above also match '*.*' when a document name
        # contains a dot; only regular files are to be sorted into them.
        next unless File.file?(file)

        ext = File.extname(file)
        # Escape the extension so its leading '.' is matched literally.
        ext_pattern = Regexp.escape(ext)
        name = file.sub(/_\d+(_\w+)?#{ext_pattern}\Z/, '')
        if ext == '.txt'
          FileUtils.mv(file, "#{name}/text/pages/#{file}")
        else
          # Image files are named <name>_<page>_<image name><ext>; the trailing
          # segment selects the target directory and is dropped from the name.
          suffix = file.match(/_([^_]+)#{ext_pattern}\Z/)[1]
          sans_suffix = file.sub(/_([^_]+)#{ext_pattern}\Z/, ext)
          FileUtils.mv(file, "#{name}/images/#{suffix}/#{sans_suffix}")
        end
      end

      # Concatenate the per-page text (in shell glob order) into one full-text file.
      names.each do |n|
        escaped = Shellwords.escape(n)
        `cat #{escaped}/text/pages/*.txt > #{escaped}/text/full/#{escaped}.txt`
      end

      `tar -czf processed_pdfs.tar *`
      save("processed_pdfs.tar")
    end


    private

    # Image specifications from the job options; an absent 'images' option is
    # treated as "no images requested".
    def image_specs
      options['images'] || []
    end

    # Appends to +command_list+ one `gm convert` shell command per pdf in the
    # working directory and per requested image spec. The output file is named
    # <pdf basename>_<image name>.<extension>.
    def generate_images_commands(command_list)
      Dir['*.pdf'].sort.each do |pdf|
        name = File.basename(pdf, File.extname(pdf))
        image_specs.each do |i|
          image = "#{name}_#{i['name']}.#{i['extension']}"
          # i['options'] is deliberately left unescaped: it carries several
          # separate command-line arguments for gm.
          command_list << "gm convert #{i['options']} #{Shellwords.escape(pdf)} #{Shellwords.escape(image)}"
        end
      end
    end

    # Appends to +command_list+ one `pdftotext` shell command per pdf in the
    # working directory, writing <pdf basename>.txt as UTF-8.
    def generate_text_commands(command_list)
      Dir['*.pdf'].sort.each do |pdf|
        name = File.basename(pdf, File.extname(pdf))
        command_list << "pdftotext -enc UTF-8 -layout -q #{Shellwords.escape(pdf)} #{Shellwords.escape("#{name}.txt")}"
      end
    end

  end
Add Comment
Please, Sign In to add comment