From 9789dd554ae58aad6fd1ddb06ae953917a370b2f Mon Sep 17 00:00:00 2001 From: Dermot Haughey Date: Thu, 8 Nov 2012 11:38:51 -0600 Subject: [PATCH] Add test cases and make modification Cleaning up code Flatten the return value Make it not add the path to the return value if an exception-worthy event occurred. Instead, merely raise that exception Make text_extractor also return paths to processed files Make function extract_images always return array of image paths Refine specs Fix tests Add nil check Refactor tests to better isolate functionality remove debugger remove logger Add printf debugging Sanity checking Printfs Remove puts Remove annoying line Cleanup Fix unnecessary usage of ternary operation to 'wrap' an Array and replaced with Array() as it is more idiomatic revert to original --- lib/docsplit/command_line.rb | 2 +- lib/docsplit/image_extractor.rb | 18 +++++++++++---- lib/docsplit/text_extractor.rb | 38 ++++++++++++++++++++++---------- test/unit/test_extract_images.rb | 13 +++++++++++ test/unit/test_extract_text.rb | 15 ++++++++++--- 5 files changed, 66 insertions(+), 20 deletions(-) diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 8d48500..aa6e7ce 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -116,4 +116,4 @@ def parse_options end -end \ No newline at end of file +end diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 8c29bbc..f6ef086 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -13,13 +13,15 @@ class ImageExtractor def extract(pdfs, options) @pdfs = [pdfs].flatten extract_options(options) + images = [] @pdfs.each do |pdf| previous = nil @sizes.each_with_index do |size, i| - @formats.each {|format| convert(pdf, size, format, previous) } + images += @formats.map {|format| convert(pdf, size, format, previous) } previous = size if @rolling end end + return images.reject{|i| i.nil? or i.empty?}.flatten end # Convert a single PDF into page images at the specified size and format. @@ -32,20 +34,28 @@ def convert(pdf, size, format, previous=nil) basename = File.basename(pdf, File.extname(pdf)) directory = directory_for(size) pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s - escaped_pdf = ESCAPE[pdf] + escaped_pdf = ESCAPE[pdf] FileUtils.mkdir_p(directory) unless File.exists?(directory) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" + image_paths = [] if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp - raise ExtractionFailed, result if $? != 0 + if $? != 0 + raise ExtractionFailed, result + end else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp result = `#{cmd}`.chomp - raise ExtractionFailed, result if $? != 0 + if $? != 0 + raise ExtractionFailed, result + else + image_paths << out_file + end end + return image_paths end ensure FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..bcb2ba1 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -29,18 +29,23 @@ def initialize def extract(pdfs, opts) extract_options opts FileUtils.mkdir_p @output unless File.exists?(@output) + pdfs = Array(pdfs) + paths = [] [pdfs].flatten.each do |pdf| @pdf_name = File.basename(pdf, File.extname(pdf)) pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages - if @force_ocr || (!@forbid_ocr && !contains_text?(pdf)) - extract_from_ocr(pdf, pages) - else - extract_from_pdf(pdf, pages) - if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty? - extract_from_ocr(pdf, @pages_to_ocr) - end - end + return_value = if @force_ocr || (!@forbid_ocr && !contains_text?(pdf)) + extract_from_ocr(pdf, pages) + else + if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty? + extract_from_ocr(pdf, @pages_to_ocr) + else + extract_from_pdf(pdf, pages) + end + end + paths << return_value end + return paths.flatten.compact end # Does a PDF have any text embedded? @@ -52,7 +57,7 @@ def contains_text?(pdf) # Extract a page range worth of text from a PDF, directly. def extract_from_pdf(pdf, pages) return extract_full(pdf) unless pages - pages.each {|page| extract_page(pdf, page) } + pages.map {|page| extract_page(pdf, page) } end # Extract a page range worth of text from a PDF via OCR. @@ -60,6 +65,7 @@ def extract_from_ocr(pdf, pages) tempdir = Dir.mktmpdir base_path = File.join(@output, @pdf_name) escaped_pdf = ESCAPE[pdf] + paths = [] if pages pages.each do |page| tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" @@ -67,7 +73,9 @@ def extract_from_ocr(pdf, pages) file = "#{base_path}_#{page}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1" - clean_text(file + '.txt') if @clean_ocr + file_name = file + '.txt' + paths << file_name + clean_text(file_name) if @clean_ocr FileUtils.remove_entry_secure tiff end else @@ -75,8 +83,11 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1" - clean_text(base_path + '.txt') if @clean_ocr + file_name = base_path + '.txt' + paths << file_name + clean_text(file_name) if @clean_ocr end + return paths ensure FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) end @@ -104,6 +115,7 @@ def run(command) def extract_full(pdf) text_path = File.join(@output, "#{@pdf_name}.txt") run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + return text_path end # Extract the contents of a single page of text, directly, adding it to @@ -111,9 +123,11 @@ def extract_full(pdf) def extract_page(pdf, page) text_path = File.join(@output, "#{@pdf_name}_#{page}.txt") run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + unless @forbid_ocr @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE end + return text_path end def extract_options(options) @@ -127,4 +141,4 @@ def extract_options(options) end -end \ No newline at end of file +end diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb index 08c0b52..65647c9 100755 --- a/test/unit/test_extract_images.rb +++ b/test/unit/test_extract_images.rb @@ -13,6 +13,19 @@ def test_image_formatting assert Dir["#{OUTPUT}/*.jpg"].length == 2 end + def test_return_value + return_value = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT) + assert return_value.length == 1 + assert return_value.is_a?(Enumerable) + assert return_value.all?{|el| el =~ /\.gif/} + return_value = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "50x", :pages => 2, :output => OUTPUT) + assert return_value.length == 2 + assert return_value.is_a?(Enumerable) + assert return_value.any?{|el| el =~ /\.gif/} + assert return_value.any?{|el| el =~ /\.jpg/} + end + + def test_page_ranges Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT) assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"] diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index 69ccb5a..ffb089e 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -4,9 +4,12 @@ class ExtractTextTest < Test::Unit::TestCase def test_paged_extraction - Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT) + return_value = Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT) assert Dir["#{OUTPUT}/*.txt"].length == 2 assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America") + assert return_value.is_a?(Enumerable) + assert return_value.all?{|val| val =~ /\.txt/} + assert return_value.length == 2 end def test_page_only_extraction @@ -24,19 +27,25 @@ def test_capitalized_pdf_extraction end def test_unicode_extraction - Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT) + return_value = Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT) assert Dir["#{OUTPUT}/*.txt"].length == 3 + assert return_value.is_a?(Enumerable) + assert return_value.all?{|val| val =~ /\.txt/} + assert return_value.length == 3 end def test_ocr_extraction - Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT) + return_value =Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT) 4.times do |i| file = "corrosion_#{i + 1}.txt" assert_directory_contains(OUTPUT, file) assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size" end + assert return_value.is_a?(Enumerable) + assert return_value.all?(/\.txt/) end + def test_ocr_extraction_in_mock_language exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")} assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"