Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- require 'pdf/reader'
- require 'json'
- require 'forwardable'
- class JsonTextReceiver
- extend Forwardable
- SPACE = " "
- attr_reader :state, :options, :characters
- ########## BEGIN FORWARDERS ##########
- # Graphics State Operators
- def_delegators :@state, :save_graphics_state, :restore_graphics_state
- # Matrix Operators
- def_delegators :@state, :concatenate_matrix
- # Text Object Operators
- def_delegators :@state, :begin_text_object, :end_text_object
- # Text State Operators
- def_delegators :@state, :set_character_spacing, :set_horizontal_text_scaling
- def_delegators :@state, :set_text_font_and_size, :font_size
- def_delegators :@state, :set_text_leading, :set_text_rendering_mode
- def_delegators :@state, :set_text_rise, :set_word_spacing
- # Text Positioning Operators
- def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
- def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
- ########## END FORWARDERS ##########
- # starting a new page
- def page=(page)
- @state = PDF::Reader::PageState.new(page)
- @content = []
- @characters = []
- @mediabox = page.objects.deref(page.attributes[:MediaBox])
- end
- #####################################################
- # Text Showing Operators
- #####################################################
- # record text that is drawn on the page
- def show_text(string) # Tj (AWAY)
- internal_show_text(string)
- end
- def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
- params.each do |arg|
- if arg.is_a?(String)
- internal_show_text(arg)
- else
- @state.process_glyph_displacement(0, arg, false)
- end
- end
- end
- def move_to_next_line_and_show_text(str) # '
- @state.move_to_start_of_next_line
- show_text(str)
- end
- def set_spacing_next_line_show_text(aw, ac, string) # "
- @state.set_word_spacing(aw)
- @state.set_character_spacing(ac)
- move_to_next_line_and_show_text(string)
- end
- #####################################################
- # XObjects
- #####################################################
- def invoke_xobject(label)
- @state.invoke_xobject(label) do |xobj|
- case xobj
- when PDF::Reader::FormXObject then
- xobj.walk(self)
- end
- end
- end
- private
- def internal_show_text(string)
- if @state.current_font.nil?
- raise PDF::Reader::MalformedPDFError, "current font is invalid"
- end
- glyphs = @state.current_font.unpack(string)
- glyphs.each_with_index do |glyph_code, index|
- # paint the current glyph
- newx, newy = @state.trm_transform(0,0)
- utf8_chars = @state.current_font.to_utf8(glyph_code)
- # apply to glyph displacment for the current glyph so the next
- # glyph will appear in the correct position
- glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
- th = 1
- scaled_glyph_width = glyph_width * @state.font_size * th
- unless utf8_chars == SPACE
- @characters << {x: newx, y: newy, scaled_font_size: scaled_glyph_width, font_size: @state.font_size, character: utf8_chars}
- end
- @state.process_glyph_displacement(glyph_width, 0, utf8_chars == SPACE)
- end
- end
- end
- PDF::Reader.open("WFD_I15_VCQM2D.pdf") do |reader|
- reader.pages.each do |page|
- receiver = JsonTextReceiver.new
- page.walk(receiver)
- puts JSON.pretty_generate(receiver.characters)
- end
- end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement