From d840ab8a7aa25affecd86ff993a135e244f68948 Mon Sep 17 00:00:00 2001
From: Luigi Marini
Date: Wed, 17 May 2023 15:03:42 -0500
Subject: [PATCH] Create a preview for a text file with the first 1MB text in
 the file.

The simple extractor version failed in its first form: it wrote str lines to
a binary temporary file and EXTRACTION_FUNC did not match the function name.
Both are fixed, but it has not been run inside the simple-extractor image
yet. Use the TextPreviewExtractor for now.
---
 text/preview/Dockerfile                  |  8 +++
 text/preview/extractor_info.json         | 24 +++++++++
 text/preview/requirements.txt            |  1 +
 text/preview/simple/Dockerfile           |  5 ++
 text/preview/simple/extractor_info.json  | 24 +++++++++
 text/preview/simple/simpletextpreview.py | 41 ++++++++++++++++
 text/preview/text_preview_extractor.py   | 62 ++++++++++++++++++++++++
 7 files changed, 165 insertions(+)
 create mode 100644 text/preview/Dockerfile
 create mode 100644 text/preview/extractor_info.json
 create mode 100644 text/preview/requirements.txt
 create mode 100644 text/preview/simple/Dockerfile
 create mode 100644 text/preview/simple/extractor_info.json
 create mode 100644 text/preview/simple/simpletextpreview.py
 create mode 100755 text/preview/text_preview_extractor.py

diff --git a/text/preview/Dockerfile b/text/preview/Dockerfile
new file mode 100644
index 0000000..39f20ed
--- /dev/null
+++ b/text/preview/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:3.8
+
+WORKDIR /extractor
+COPY requirements.txt ./
+RUN pip install -r requirements.txt
+
+COPY text_preview_extractor.py extractor_info.json ./
+CMD python text_preview_extractor.py
\ No newline at end of file
diff --git a/text/preview/extractor_info.json b/text/preview/extractor_info.json
new file mode 100644
index 0000000..fd0f074
--- /dev/null
+++ b/text/preview/extractor_info.json
@@ -0,0 +1,24 @@
+{
+  "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
+  "name": "ncsa.text.preview",
+  "version": "1.0",
+  "description": "Given a text file, keep the first 1MB of text.",
+  "author": "Luigi Marini ",
+  "contributors": [],
+  "contexts": [
+  ],
+  "repository": [
+    {
+      "repType": "git",
+      "repUrl": "https://github.com/clowder-framework/extractors-core.git"
+    }
+  ],
+  "process": {
+    "file": [
+      "text/*"
+    ]
+  },
+  "external_services": [],
+  "dependencies": [],
+  "bibtex": []
+}
diff --git a/text/preview/requirements.txt b/text/preview/requirements.txt
new file mode 100644
index 0000000..136984d
--- /dev/null
+++ b/text/preview/requirements.txt
@@ -0,0 +1 @@
+pyclowder==2.7.0
\ No newline at end of file
diff --git a/text/preview/simple/Dockerfile b/text/preview/simple/Dockerfile
new file mode 100644
index 0000000..0770b57
--- /dev/null
+++ b/text/preview/simple/Dockerfile
@@ -0,0 +1,5 @@
+ARG PYCLOWDER_PYTHON=""
+FROM clowder/extractors-simple-extractor${PYCLOWDER_PYTHON}:onbuild
+
+ENV EXTRACTION_FUNC="textpreview"
+ENV EXTRACTION_MODULE="simpletextpreview"
diff --git a/text/preview/simple/extractor_info.json b/text/preview/simple/extractor_info.json
new file mode 100644
index 0000000..fd0f074
--- /dev/null
+++ b/text/preview/simple/extractor_info.json
@@ -0,0 +1,24 @@
+{
+  "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
+  "name": "ncsa.text.preview",
+  "version": "1.0",
+  "description": "Given a text file, keep the first 1MB of text.",
+  "author": "Luigi Marini ",
+  "contributors": [],
+  "contexts": [
+  ],
+  "repository": [
+    {
+      "repType": "git",
+      "repUrl": "https://github.com/clowder-framework/extractors-core.git"
+    }
+  ],
+  "process": {
+    "file": [
+      "text/*"
+    ]
+  },
+  "external_services": [],
+  "dependencies": [],
+  "bibtex": []
+}
diff --git a/text/preview/simple/simpletextpreview.py b/text/preview/simple/simpletextpreview.py
new file mode 100644
index 0000000..5bdc050
--- /dev/null
+++ b/text/preview/simple/simpletextpreview.py
@@ -0,0 +1,41 @@
+"""Simple-extractor variant of the text preview.
+
+NOTE(review): the first version of this module failed because it wrote str
+lines to a binary-mode temporary file and because EXTRACTION_FUNC did not
+match the function name. Both are fixed, but the module has not been run
+inside the simple-extractor image yet.
+"""
+
+import os
+import tempfile
+
+# Upper bound on how much of the input is copied into the preview (1 MB).
+MAX_PREVIEW_BYTES = 1000000
+
+
+def textpreview(input_file_path, max_bytes=MAX_PREVIEW_BYTES):
+    """
+    This function keeps the first max_bytes bytes of a file and writes them to a preview file.
+
+    The data is copied as raw bytes, so neither the encoding of the input nor
+    the length of its lines can make the copy fail or exceed the limit.
+
+    :param input_file_path: Full path to the input file
+    :param max_bytes: Maximum number of bytes copied into the preview
+    :return: Result dictionary containing the path on disk of the preview
+    """
+    with open(input_file_path, 'rb') as source:
+        head = source.read(max_bytes)
+
+    # delete=False: the returned path must still exist after this function returns.
+    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp:
+        tmp.write(head)
+
+    # return the path of preview on disk
+    print("Preview path " + tmp.name)
+    result = {
+        'previews': [tmp.name]
+    }
+
+    # Return the result dictionary
+    return result
diff --git a/text/preview/text_preview_extractor.py b/text/preview/text_preview_extractor.py
new file mode 100755
index 0000000..67952ff
--- /dev/null
+++ b/text/preview/text_preview_extractor.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+"""Clowder extractor that uploads the start of a text file as its preview."""
+import logging
+import os
+import tempfile
+
+import pyclowder
+from pyclowder.extractors import Extractor
+
+# Upper bound on how much of the input is copied into the preview (1 MB).
+MAX_PREVIEW_BYTES = 1000000
+
+PREVIEW_HEADER = ("Previewing up to the first 1 megabyte of file contents. "
+                  "Download file to see full data.\n---\n")
+
+
+class TextPreviewExtractor(Extractor):
+    """Extractor that uploads the first megabyte of a file as a text preview."""
+
+    def __init__(self):
+        Extractor.__init__(self)
+        self.setup()
+
+        # setup logging for the extractor
+        logging.getLogger('pyclowder').setLevel(logging.DEBUG)
+        logging.getLogger('__main__').setLevel(logging.DEBUG)
+
+    def process_message(self, connector, host, secret_key, resource, parameters):
+        """Create the preview for one file and upload it.
+
+        :param connector: used for the upload and for the status update
+        :param host: passed through to pyclowder.files.upload_preview
+        :param secret_key: passed through to pyclowder.files.upload_preview
+        :param resource: mapping; "local_paths" and "id" are read from it
+        :param parameters: not used
+        """
+        inputfile = resource["local_paths"][0]
+        file_id = resource['id']
+
+        # Copy raw bytes instead of decoded lines: the input may use any
+        # encoding, and a file without line breaks must not be read whole.
+        with open(inputfile, 'rb') as source:
+            head = source.read(MAX_PREVIEW_BYTES)
+
+        tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="wb", delete=False, dir='./')
+        try:
+            with tmp:
+                tmp.write(PREVIEW_HEADER.encode("utf-8"))
+                tmp.write(head)
+
+            pyclowder.files.upload_preview(connector, host, secret_key, file_id, tmp.name, None)
+            connector.status_update(pyclowder.utils.StatusMessage.processing, resource,
+                                    "Uploaded preview of type txt")
+        finally:
+            # The preview is created in the working directory with
+            # delete=False; remove it so files do not pile up per message.
+            os.remove(tmp.name)
+
+
+if __name__ == "__main__":
+    extractor = TextPreviewExtractor()
+    extractor.start()