diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 7c7af08..1ab580f 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -91,6 +91,9 @@ def parse_options opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o| @options[:ocr] = o end + opts.on('--hocr', 'force hOCR output when OCR enabled') do |h| + @options[:hocr] = h + end opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c| @options[:clean] = false end @@ -119,4 +122,4 @@ def parse_options end -end \ No newline at end of file +end diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..422a270 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -60,21 +60,25 @@ def extract_from_ocr(pdf, pages) tempdir = Dir.mktmpdir base_path = File.join(@output, @pdf_name) escaped_pdf = ESCAPE[pdf] + additional_opts = "" + additional_opts += "hocr " if @use_hocr if pages pages.each do |page| tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1" + run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{additional_opts} 2>&1" clean_text(file + '.txt') if @clean_ocr + run "cp #{escaped_tiff} #{base_path}_#{page}.tif" if @use_hocr FileUtils.remove_entry_secure tiff end else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1" + run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{additional_opts} 2>&1" + run "cp #{escaped_tiff} #{base_path}.tif" if @use_hocr clean_text(base_path + '.txt') if @clean_ocr end ensure @@ -120,11 +124,12 @@ def extract_options(options) @output = options[:output] || '.' @pages = options[:pages] @force_ocr = options[:ocr] == true + @use_hocr = options[:hocr] == true @forbid_ocr = options[:ocr] == false - @clean_ocr = !(options[:clean] == false) + @clean_ocr = !(options[:clean] == false) && !@use_hocr @language = options[:language] || 'eng' end end -end \ No newline at end of file +end