Skip to content
17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,26 @@ Create a `tessdata` directory in your project directory, and download the
Transcription is done automatically after Document save,
in an [`asyncio`][7] executor to prevent blocking the response during processing.

To transcribe all existing Documents, run the management command::
### Transcribe existing documents

To transcribe all existing Documents, run the management command:

./manage.py transcribe_documents

This may take a long time, obviously.
Transcribing every document may take a long time. To transcribe a subset of documents, include a slice notation that will be applied to the document queryset (the alternative `-s` syntax may also be used, e.g. `-s 4:7`):

./manage.py transcribe_documents --slice 4:7

To control the amount of text written to the terminal while transcribing, set `--verbosity` to a level between 0 and 3 (the alternative `-v` syntax may also be used, e.g. `-v 2`):

./manage.py transcribe_documents --slice 4:7 --verbosity 2

Verbosity level 0 outputs only the number of documents to be transcribed. Verbosity level 1 also outputs the name of each document, the number of the document in the subject queryset, and the slice notation that would cause a particular document to be transcribed. Verbosity level 2 also outputs a message when Tesseract is invoked. Verbosity level 3 adds to the output of levels 0 through 2 by outputting the text that was transcribed for a document.

To do a dry run without actually starting transcription, include the `--dry-run` flag (the alternative `-d` syntax may also be used):

./manage.py transcribe_documents --slice 4:7 --verbosity 2 --dry-run

## Usage in custom view

Here is a code example for a search view (outside Wagtail's admin interface)
Expand Down
17 changes: 10 additions & 7 deletions src/wagtail_textract/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
loop = asyncio.get_event_loop()


def transcribe_document(document):
def transcribe_document(document, options):
"""Store the Document file's text in the transcription field."""
try:
text = textract.process(document.file.path).strip()
if not text:
logger.debug('No text found, falling back to tesseract.')
if 'verbosity' in options and options['verbosity'] >= 2:
print('No text found - falling back to tesseract: {} ({})'.format(document, document.filename))
text = textract.process(
document.file.path,
method='tesseract',
Expand All @@ -20,7 +21,7 @@ def transcribe_document(document):
except Exception as err:
text = None
logger.error(
'Text extraction error with file {file}: {message}'.format(
'\n\nText extraction error with file {file}: {message}\n\n'.format(
file=document.filename,
message=str(err),
)
Expand All @@ -29,11 +30,13 @@ def transcribe_document(document):
if text:
document.transcription = text.decode()
document.save(transcribe=False)
print("Saved transcription: %s" % text)
if 'verbosity' in options and options['verbosity'] == 3:
print("Saved transcription for {}:\n{}\n".format(document, text))
else:
logger.error('No text found.')
logger.error('No text found: {} ({})'.format(document, document.filename))


def async_transcribe_document(document, options=None):
    """Defer transcription to an asyncio executor.

    ``document`` is the Document instance to transcribe.

    ``options`` is an optional mapping of management-command options (for
    example ``{'verbosity': 2}``). It defaults to ``None`` so callers that
    only pass a document keep working.
    """
    # transcribe_document performs ``'verbosity' in options`` membership
    # tests, which raise TypeError when options is None, so always hand it
    # a mapping.
    loop.run_in_executor(None, transcribe_document, document, options or {})

43 changes: 38 additions & 5 deletions src/wagtail_textract/management/commands/transcribe_documents.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,48 @@
from django.core.management.base import BaseCommand

from wagtail.documents.models import get_document_model

from wagtail_textract.handlers import async_transcribe_document


class Command(BaseCommand):
    """Extract text from Documents, optionally limited to a slice of them."""

    help = 'Extract text from Documents'

    def add_arguments(self, parser):
        """Register the optional ``--slice`` and ``--dry-run`` arguments."""
        # Named (optional) arguments
        parser.add_argument(
            '-s', '--slice', type=str,
            help="Transcribe a subset of documents using Python's basic slicing syntax",
        )
        parser.add_argument(
            '-d', '--dry-run', action='store_true', dest='dry_run',
            help="Show what actions will be undertaken with a given transcribe command and its associated parameters",
        )

    @staticmethod
    def _parse_slice(notation):
        """Convert ``START:STOP`` notation into a ``(start, stop)`` tuple.

        Either bound may be omitted (``4:``, ``:7``, ``:``); an omitted bound
        is returned as ``None``.

        Raises ``CommandError`` when the colon is missing, a bound is not an
        integer (which also rejects a step such as ``4:7:2``), or a bound is
        negative.
        """
        # Imported here so this class does not rely on a change to the
        # module-level import list.
        from django.core.management.base import CommandError

        start_text, separator, stop_text = notation.partition(':')
        if not separator:
            raise CommandError(
                "Invalid slice '{}': expected START:STOP".format(notation)
            )
        try:
            start = int(start_text) if start_text else None
            stop = int(stop_text) if stop_text else None
        except ValueError:
            raise CommandError(
                "Invalid slice '{}': bounds must be integers".format(notation)
            )
        # Reject negative bounds up front with a readable message instead of
        # letting the queryset slice fail later.
        if any(bound is not None and bound < 0 for bound in (start, stop)):
            raise CommandError(
                "Invalid slice '{}': negative bounds are not supported".format(notation)
            )
        return start, stop

    def handle(self, *args, **options):
        """Extract text from all Documents, or from the requested slice.

        Reads the ``slice``, ``dry_run`` and ``verbosity`` options. With
        ``dry_run`` set, only reports which documents would be transcribed;
        otherwise hands each document to ``async_transcribe_document``.
        """
        docs = get_document_model().objects.all().order_by('title')
        first_index = 0
        if options['slice']:
            start, stop = self._parse_slice(options['slice'])
            docs = docs[start:stop]
            # Offset used to print the slice notation that selects each
            # individual document.
            first_index = start or 0

        dry_run = options['dry_run']
        total = docs.count()

        if dry_run:
            self.stdout.write("\n{:,} documents will be transcribed\n\n".format(total))
        else:
            self.stdout.write("\nStarting Transcription of {:,} documents\n\n".format(total))

        for position, document in enumerate(docs, start=1):
            if options['verbosity'] >= 1:
                index = first_index + position - 1
                self.stdout.write(
                    "{:,} (-s {}:{}) - {}".format(position, index, index + 1, document)
                )
            if not dry_run:
                async_transcribe_document(document, options)

        if not dry_run:
            self.stdout.write("\n{:,} documents being processed asynchonously\n\n--- AWAITING COMPLETION ---\n\n".format(total))
        else:
            self.stdout.write("")
2 changes: 1 addition & 1 deletion src/wagtail_textract/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def save(self, **kwargs):
transcribe = kwargs.pop('transcribe', True)
super(TranscriptionMixin, self).save(**kwargs)
if transcribe:
async_transcribe_document(self)
async_transcribe_document(self, None)


class Document(TranscriptionMixin, WagtailDocument):
Expand Down
23 changes: 17 additions & 6 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -1,28 +1,39 @@
[tox]
envlist =
py{34,35,36}-dj{20}-wt{20,21,22}
py{35,36}-dj{21}-wt{23}
py{35,36,37}-dj{21}-wt{24}
py{35,36}-dj{20}-wt{20,21,22}
py{35,36}-dj{20,21}-wt{23}
py{35,36,37}-dj{20,21}-wt{24}
py{35,36,37}-dj{20,21,22}-wt{25,26}
py{35,36,37,38}-dj{20,21,22}-wt{27}
py{36,37,38}-dj{21,22,30}-wt{28}
py{36,37,38}-dj{22,30}-wt{29}

[testenv]
basepython =
py34: python3.4
py35: python3.5
py35: python3.5
py36: python3.6
py37: python3.7
py38: python3.8


deps =
pytest
pytest-django
coverage
codecov
dj20: Django>=2.0,<2.1
dj21: Django>=2.1,<2.2
dj21: Django>=2.1,<2.2
dj22: Django>=2.2,<2.3
dj30: Django>=3.0,<3.1
wt20: wagtail>=2.0,<2.1
wt21: wagtail>=2.1,<2.2
wt22: wagtail>=2.2,<2.3
wt23: wagtail>=2.3,<2.4
wt24: wagtail>=2.4,<2.5
wt25: wagtail>=2.5,<2.6
wt26: wagtail>=2.6,<2.7
wt27: wagtail>=2.7,<2.8
wt28: wagtail>=2.8,<2.9

whitelist_externals =
make
Expand Down
2 changes: 1 addition & 1 deletion travis-textract-requirements/python.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ xlrd==1.0.0
EbookLib==0.16
SpeechRecognition==3.7.1
https://github.com/mattgwwalker/msg-extractor/zipball/master
six==1.10.0
six>=1.11,<2.0
pocketsphinx==0.1.3