From 9789dd554ae58aad6fd1ddb06ae953917a370b2f Mon Sep 17 00:00:00 2001
From: Dermot Haughey <hderms@gmail.com>
Date: Thu, 8 Nov 2012 11:38:51 -0600
Subject: [PATCH] Add test cases and make modification

Cleaning up code

Flatten the return value

Do not add the path to the return value when an extraction error
occurs; raise the exception instead

Make text_extractor also return paths to processed files

Make function extract_images always return array of image paths

Refine specs

Fix tests

Add nil check

Refactor tests to better isolate functionality

remove debugger

remove logger

Add printf debugging

Sanity checking

Printfs

Remove puts

Remove annoying line

Cleanup

Replace the unnecessary ternary used to 'wrap' a value in an Array
with Array(), which is more idiomatic

revert to original
---
 lib/docsplit/command_line.rb     |  2 +-
 lib/docsplit/image_extractor.rb  | 18 +++++++++++----
 lib/docsplit/text_extractor.rb   | 38 ++++++++++++++++++++++----------
 test/unit/test_extract_images.rb | 13 +++++++++++
 test/unit/test_extract_text.rb   | 15 ++++++++++---
 5 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
index 8d48500..aa6e7ce 100755
--- a/lib/docsplit/command_line.rb
+++ b/lib/docsplit/command_line.rb
@@ -116,4 +116,4 @@ def parse_options
 
   end
 
-end
\ No newline at end of file
+end
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 8c29bbc..f6ef086 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -13,13 +13,15 @@ class ImageExtractor
     def extract(pdfs, options)
       @pdfs = [pdfs].flatten
       extract_options(options)
+      images = [] # accumulates whatever each convert call returns
       @pdfs.each do |pdf|
         previous = nil
         @sizes.each_with_index do |size, i|
-          @formats.each {|format| convert(pdf, size, format, previous) }
+          images.concat(@formats.map {|format| convert(pdf, size, format, previous) })
           previous = size if @rolling
         end
       end
+      images.flatten.compact # convert may return nil or []; keep only the image paths
     end
 
     # Convert a single PDF into page images at the specified size and format.
@@ -32,20 +34,28 @@ def convert(pdf, size, format, previous=nil)
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
-      escaped_pdf = ESCAPE[pdf]
+      escaped_pdf =  ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+      image_paths = []
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
         result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
-        raise ExtractionFailed, result if $? != 0
+        if $? != 0
+        raise ExtractionFailed, result 
+        end
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
           cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
-          raise ExtractionFailed, result if $? != 0
+          if $? != 0
+          raise ExtractionFailed, result 
+          else
+            image_paths << out_file
+          end
         end
+        return image_paths
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 0d55f32..bcb2ba1 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -29,18 +29,23 @@ def initialize
     def extract(pdfs, opts)
       extract_options opts
       FileUtils.mkdir_p @output unless File.exists?(@output)
+      # Text file paths produced for every PDF, returned to the caller.
+      paths = []
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
-        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
-          extract_from_ocr(pdf, pages)
-        else
-          extract_from_pdf(pdf, pages)
-          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
-            extract_from_ocr(pdf, @pages_to_ocr)
-          end
-        end
+        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
+          paths << extract_from_ocr(pdf, pages)
+        else
+          # Direct extraction must run first: it is what fills @pages_to_ocr.
+          paths << extract_from_pdf(pdf, pages)
+          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
+            # OCR rewrites the same per-page files, hence the uniq below.
+            paths << extract_from_ocr(pdf, @pages_to_ocr)
+          end
+        end
       end
+      return paths.flatten.compact.uniq
     end
 
     # Does a PDF have any text embedded?
@@ -52,7 +57,7 @@ def contains_text?(pdf)
     # Extract a page range worth of text from a PDF, directly.
     def extract_from_pdf(pdf, pages)
       return extract_full(pdf) unless pages
-      pages.each {|page| extract_page(pdf, page) }
+      pages.map {|page| extract_page(pdf, page) } # one text path per page
     end
 
     # Extract a page range worth of text from a PDF via OCR.
@@ -60,6 +65,7 @@ def extract_from_ocr(pdf, pages)
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
+      paths = []
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
@@ -67,7 +73,9 @@ def extract_from_ocr(pdf, pages)
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
-          clean_text(file + '.txt') if @clean_ocr
+          file_name = file + '.txt'
+          paths << file_name
+          clean_text(file_name) if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
@@ -75,8 +83,11 @@ def extract_from_ocr(pdf, pages)
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
-        clean_text(base_path + '.txt') if @clean_ocr
+        file_name = base_path + '.txt'
+        paths << file_name
+        clean_text(file_name) if @clean_ocr
       end
+      return paths
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
@@ -104,6 +115,7 @@ def run(command)
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
       run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      return text_path # hand the output path back so extract can report it
     end
 
     # Extract the contents of a single page of text, directly, adding it to
@@ -111,9 +123,11 @@ def extract_full(pdf)
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
       run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end
+      return text_path # returned even when the page was also queued for OCR
     end
 
     def extract_options(options)
@@ -127,4 +141,4 @@ def extract_options(options)
 
   end
 
-end
\ No newline at end of file
+end
diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb
index 08c0b52..65647c9 100755
--- a/test/unit/test_extract_images.rb
+++ b/test/unit/test_extract_images.rb
@@ -13,6 +13,19 @@ def test_image_formatting
     assert Dir["#{OUTPUT}/*.jpg"].length == 2
   end
 
+  def test_return_value
+    gif_paths = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
+    assert gif_paths.length == 1 # one page, one format
+    assert gif_paths.is_a?(Enumerable)
+    assert gif_paths.all? {|path| path =~ /\.gif/ }
+    mixed_paths = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "50x", :pages => 2, :output => OUTPUT)
+    assert mixed_paths.length == 2 # one page, two formats
+    assert mixed_paths.is_a?(Enumerable)
+    assert mixed_paths.any? {|path| path =~ /\.gif/ }
+    assert mixed_paths.any? {|path| path =~ /\.jpg/ }
+  end
+
+
   def test_page_ranges
     Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"]
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
index 69ccb5a..ffb089e 100755
--- a/test/unit/test_extract_text.rb
+++ b/test/unit/test_extract_text.rb
@@ -4,9 +4,12 @@
 class ExtractTextTest < Test::Unit::TestCase
 
   def test_paged_extraction
-    Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
+    return_value = Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.txt"].length == 2
     assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America")
+    assert return_value.is_a?(Enumerable)
+    assert return_value.all?{|val| val =~ /\.txt/}
+    assert return_value.length == 2 # same count as the .txt files found in OUTPUT above
   end
 
   def test_page_only_extraction
@@ -24,19 +27,25 @@ def test_capitalized_pdf_extraction
   end
 
   def test_unicode_extraction
-    Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
+    return_value = Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.txt"].length == 3
+    assert return_value.is_a?(Enumerable)
+    assert return_value.all?{|val| val =~ /\.txt/}
+    assert return_value.length == 3 # same count as the .txt files found in OUTPUT above
   end
 
   def test_ocr_extraction
-    Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
+    return_value = Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
     4.times do |i|
       file = "corrosion_#{i + 1}.txt"
       assert_directory_contains(OUTPUT, file)
       assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
     end
+    assert return_value.is_a?(Enumerable)
+    assert return_value.all?{|val| val =~ /\.txt/} # block form: all?(pattern) needs Ruby >= 2.5
   end
 
+
   def test_ocr_extraction_in_mock_language
     exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
     assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"