fourdigits · DanielSwain · Apr 15, 2019 · Apr 15, 2019 · Apr 16, 2019 · Apr 16, 2019
diff --git a/README.md b/README.md
@@ -73,13 +73,26 @@ Create a `tessdata` directory in your project directory, and download the
 Transcription is done automatically after Document save,
 in an [`asyncio`][7] executor to prevent blocking the response during processing.
 
-To transcribe all existing Documents, run the management command::
+### Transcribe existing documents
+
+To transcribe all existing Documents, run the management command:
 
     ./manage.py transcribe_documents
 
-This may take a long time, obviously.
+Transcribing every document may take a long time.  To transcribe only a subset of documents, pass a slice in Python's basic slicing syntax; it is applied to the document queryset, ordered by title (the alternative `-s` syntax may also be used, e.g. `-s 4:7`):
+
+    ./manage.py transcribe_documents --slice 4:7
+
+To control the amount of text written to the terminal while transcribing, set `--verbosity` to a level between 0 and 3 (the alternative `-v` syntax may also be used, e.g. `-v 2`):
+
+    ./manage.py transcribe_documents --slice 4:7 --verbosity 2
+
+Verbosity level 0 outputs only the number of documents to be transcribed.  Verbosity level 1 also outputs the name of each document, the number of the document in the selected queryset, and the slice notation that would cause that particular document to be transcribed.  Verbosity level 2 also outputs a message when Tesseract is invoked.  Verbosity level 3 adds to the output of levels 0 through 2 by outputting the text that was transcribed for each document.
 
+To do a dry run without actually starting transcription, include the `--dry-run` flag (the alternative `-d` syntax may also be used):
 
+    ./manage.py transcribe_documents --slice 4:7 --verbosity 2 --dry-run
+
 ## Usage in custom view
 
 Here is a code example for a search view (outside Wagtail's admin interface)

diff --git a/src/wagtail_textract/handlers.py b/src/wagtail_textract/handlers.py
@@ -6,12 +6,13 @@
 loop = asyncio.get_event_loop()
 
 
-def transcribe_document(document):
+def transcribe_document(document, options):
     """Store the Document file's text in the transcription field."""
     try:
         text = textract.process(document.file.path).strip()
         if not text:
-            logger.debug('No text found, falling back to tesseract.')
+            if 'verbosity' in options and options['verbosity'] >= 2:
+                print('No text found - falling back to tesseract:  {} ({})'.format(document, document.filename))
             text = textract.process(
                 document.file.path,
                 method='tesseract',
@@ -20,7 +21,7 @@ def transcribe_document(document):
     except Exception as err:
         text = None
         logger.error(
-            'Text extraction error with file {file}: {message}'.format(
+            '\n\nText extraction error with file {file}:  {message}\n\n'.format(
                 file=document.filename,
                 message=str(err),
             )
@@ -29,11 +30,13 @@ def transcribe_document(document):
     if text:
         document.transcription = text.decode()
         document.save(transcribe=False)
-        print("Saved transcription: %s" % text)
+        if 'verbosity' in options and options['verbosity'] == 3:
+            print("Saved transcription for {}:\n{}\n".format(document, text))
     else:
-        logger.error('No text found.')
+        logger.error('No text found:  {} ({})'.format(document, document.filename))
 
 
-def async_transcribe_document(document):
+def async_transcribe_document(document, options):
     """Defer transcription to an asyncio executor."""
-    loop.run_in_executor(None, transcribe_document, document)
+    loop.run_in_executor(None, transcribe_document, document, options)
+
diff --git a/src/wagtail_textract/management/commands/transcribe_documents.py b/src/wagtail_textract/management/commands/transcribe_documents.py
@@ -1,15 +1,48 @@
 from django.core.management.base import BaseCommand
-
 from wagtail.documents.models import get_document_model
-
 from wagtail_textract.handlers import async_transcribe_document
 
 
 class Command(BaseCommand):
     """Extract text from all Documents."""
+    help = 'Extract text from Documents'
 
+    def add_arguments(self, parser):
+        # Named (optional) arguments
+        parser.add_argument('-s', '--slice', type=str, help="Transcribe a subset of documents using Python's basic slicing syntax")
+        parser.add_argument('-d', '--dry-run', action='store_true', dest='dry_run', help="Show what actions will be undertaken with a given transcribe command and its associated parameters")
+
     def handle(self, *args, **options):
         """Extract text from all Documents."""
-        for document in get_document_model().objects.all():
-            self.stdout.write("Transcribing %s" % document)
-            async_transcribe_document(document)
+        ctr = 1
+        slice_ctr = 0
+        if options['slice']:
+            slices = [x for x in options['slice'].split(':') if x]
+            if len(slices) == 2:
+                docs = get_document_model().objects.all().order_by('title')[int(slices[0]):int(slices[1])]
+                slice_ctr = int(slices[0])
+            elif options['slice'].startswith(':') and len(slices) == 1:
+                docs = get_document_model().objects.all().order_by('title')[:int(slices[0])]
+            elif options['slice'].endswith(':') and len(slices) == 1:
+                docs = get_document_model().objects.all().order_by('title')[int(slices[0]):]
+                slice_ctr = int(slices[0])
+            else:
+                docs = get_document_model().objects.all().order_by('title')
+        else:
+            docs = get_document_model().objects.all().order_by('title')
+
+        if options['dry_run']:
+            self.stdout.write("\n{:,} documents will be transcribed\n\n".format( docs.count()))
+        else:
+            self.stdout.write("\nStarting Transcription of {:,} documents\n\n".format( docs.count()))
+        for document in docs:
+            if options['verbosity'] >= 1:
+                print("{:,} (-s {}:{}) - {}".format(ctr, slice_ctr, slice_ctr + 1, document))
+            if not options['dry_run']:
+                async_transcribe_document(document, options)
+            ctr += 1
+            slice_ctr += 1
+        if not options['dry_run']:
+            self.stdout.write("\n{:,} documents being processed asynchonously\n\n--- AWAITING COMPLETION ---\n\n".format( docs.count()))
+        else:
+            self.stdout.write("")
diff --git a/src/wagtail_textract/models.py b/src/wagtail_textract/models.py
@@ -18,7 +18,7 @@ def save(self, **kwargs):
         transcribe = kwargs.pop('transcribe', True)
         super(TranscriptionMixin, self).save(**kwargs)
         if transcribe:
-            async_transcribe_document(self)
+            async_transcribe_document(self, None)
 
 
 class Document(TranscriptionMixin, WagtailDocument):

diff --git a/tox.ini b/tox.ini
@@ -1,28 +1,39 @@
 [tox]
 envlist =
-    py{34,35,36}-dj{20}-wt{20,21,22}
-    py{35,36}-dj{21}-wt{23}
-    py{35,36,37}-dj{21}-wt{24}
+    py{35,36}-dj{20}-wt{20,21,22}
+    py{35,36}-dj{20,21}-wt{23}
+    py{35,36,37}-dj{20,21}-wt{24}
+    py{35,36,37}-dj{20,21,22}-wt{25,26}
+    py{35,36,37,38}-dj{20,21,22}-wt{27}
+    py{36,37,38}-dj{21,22,30}-wt{28}
+    py{36,37,38}-dj{22,30}-wt{29}    
 
 [testenv]
 basepython =
-    py34: python3.4
-    py35: python3.5
+    py35: python3.5    
     py36: python3.6
     py37: python3.7
+    py38: python3.8
+
 
 deps =
     pytest
     pytest-django
     coverage
     codecov
     dj20: Django>=2.0,<2.1
-    dj21: Django>=2.1,<2.2
+    dj21: Django>=2.1,<2.2    
+    dj22: Django>=2.2,<2.3
+    dj30: Django>=3.0,<3.1    
     wt20: wagtail>=2.0,<2.1
     wt21: wagtail>=2.1,<2.2
     wt22: wagtail>=2.2,<2.3
     wt23: wagtail>=2.3,<2.4
     wt24: wagtail>=2.4,<2.5
+    wt25: wagtail>=2.5,<2.6
+    wt26: wagtail>=2.6,<2.7
+    wt27: wagtail>=2.7,<2.8
+    wt28: wagtail>=2.8,<2.9    
 
 whitelist_externals =
     make

diff --git a/travis-textract-requirements/python.txt b/travis-textract-requirements/python.txt
@@ -12,5 +12,5 @@ xlrd==1.0.0
 EbookLib==0.16
 SpeechRecognition==3.7.1
 https://github.com/mattgwwalker/msg-extractor/zipball/master
-six==1.10.0
+six>=1.11,<2.0
 pocketsphinx==0.1.3