diff --git a/lib/pdf-reader-turtletext.rb b/lib/pdf-reader-turtletext.rb index 2e8d04b..877bb4c 100644 --- a/lib/pdf-reader-turtletext.rb +++ b/lib/pdf-reader-turtletext.rb @@ -1,6 +1,7 @@ require 'pdf-reader' require 'pdf/reader/patch/object_hash' require 'pdf/reader/positional_text_receiver' +require 'pdf/reader/transposed_positional_text_receiver' require 'pdf/reader/turtletext' require 'pdf/reader/turtletext/version' diff --git a/lib/pdf/reader/transposed_positional_text_receiver.rb b/lib/pdf/reader/transposed_positional_text_receiver.rb new file mode 100644 index 0000000..aeb990f --- /dev/null +++ b/lib/pdf/reader/transposed_positional_text_receiver.rb @@ -0,0 +1,31 @@ +# Receiver to access positional text content from a PDF, keyed by x coordinate first and then y coordinate +# +# Typical usage: +# +# reader = PDF::Reader.new(filename) +# receiver = PDF::Reader::TransposedPositionalTextReceiver.new +# reader.page(page).walk(receiver) +# receiver.content +# +class PDF::Reader::TransposedPositionalTextReceiver < PDF::Reader::PositionalTextReceiver + + # record text that is drawn on the page, stored under its x position and then its y position + def show_text(string) # Tj + raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil? + newx, newy = @state.trm_transform(0,0) + @content[newx] ||= {} + @content[newx][newy] ||= '' + @content[newx][newy] << @state.current_font.to_utf8(string) + end + + # override PageTextReceiver content accessor (delegates to super).
+ # Returns a hash of positional text: + # { + # x_coord=>{y_coord=>text, y_coord=>text }, + # x_coord=>{y_coord=>text, y_coord=>text } + # } + def content + super + end + +end diff --git a/lib/pdf/reader/turtletext.rb b/lib/pdf/reader/turtletext.rb index d5a85cb..c6e2666 100644 --- a/lib/pdf/reader/turtletext.rb +++ b/lib/pdf/reader/turtletext.rb @@ -18,6 +18,7 @@ class PDF::Reader::Turtletext # +source+ is a file name or stream-like object # Supported +options+ include: # * :y_precision + # * :transpose_coordinates def initialize(source, options={}) @options = options @reader = PDF::Reader.new(source) @@ -32,6 +33,14 @@ def y_precision options[:y_precision] ||= 3 end + # Returns whether or not the coordinates of the text objects are transposed. + # This is an option specified at object creation time that will essentially swap + # X and Y coordinates of all the text objects. This can improve the structured content + # if the PDF document is oriented in landscape. The option is treated as false when not supplied. + def coordinates_transposed? + options[:transpose_coordinates] ||= false + end + # Returns positional (with fuzzed y positioning) text content collection as a hash: # [ fuzzed_y_position, [[x_position,content]] ] def content(page=1) @@ -153,7 +162,9 @@ def bounding_box(&block) private def load_content(page) - receiver = PDF::Reader::PositionalTextReceiver.new + receiver = (coordinates_transposed? ? + PDF::Reader::TransposedPositionalTextReceiver.new : + PDF::Reader::PositionalTextReceiver.new) reader.page(page).walk(receiver) receiver.content end