From d840ab8a7aa25affecd86ff993a135e244f68948 Mon Sep 17 00:00:00 2001
From: Luigi Marini <lmarini@illinois.edu>
Date: Wed, 17 May 2023 15:03:42 -0500
Subject: [PATCH] Create a preview for a text file with the first 1MB text in
 the file.

The simple extractor version doesn't work. Ignore for now. Use the TextPreviewExtractor for now.
---
 text/preview/Dockerfile                  |  8 +++++
 text/preview/extractor_info.json         | 24 +++++++++++++
 text/preview/requirements.txt            |  1 +
 text/preview/simple/Dockerfile           |  5 +++
 text/preview/simple/extractor_info.json  | 24 +++++++++++++
 text/preview/simple/simpletextpreview.py | 37 ++++++++++++++++++++
 text/preview/text_preview_extractor.py   | 43 ++++++++++++++++++++++++
 7 files changed, 142 insertions(+)
 create mode 100644 text/preview/Dockerfile
 create mode 100644 text/preview/extractor_info.json
 create mode 100644 text/preview/requirements.txt
 create mode 100644 text/preview/simple/Dockerfile
 create mode 100644 text/preview/simple/extractor_info.json
 create mode 100644 text/preview/simple/simpletextpreview.py
 create mode 100755 text/preview/text_preview_extractor.py

diff --git a/text/preview/Dockerfile b/text/preview/Dockerfile
new file mode 100644
index 0000000..39f20ed
--- /dev/null
+++ b/text/preview/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:3.8
+
+WORKDIR /extractor
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+# Exec-form CMD below: python runs without a /bin/sh wrapper and gets stop signals directly.
+COPY text_preview_extractor.py extractor_info.json ./
+CMD ["python", "text_preview_extractor.py"]
\ No newline at end of file
diff --git a/text/preview/extractor_info.json b/text/preview/extractor_info.json
new file mode 100644
index 0000000..fd0f074
--- /dev/null
+++ b/text/preview/extractor_info.json
@@ -0,0 +1,24 @@
+{
+  "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
+  "name": "ncsa.text.preview",
+  "version": "1.0",
+  "description": "Given a text file, keep the first 1MB of text.",
+  "author": "Luigi Marini <lmarini@illinois.edu>",
+  "contributors": [],
+  "contexts": [
+  ],
+  "repository": [
+    {
+      "repType": "git",
+      "repUrl": "https://github.com/clowder-framework/extractors-core.git"
+    }
+  ],
+  "process": {
+    "file": [
+      "text/*"
+    ]
+  },
+  "external_services": [],
+  "dependencies": [],
+  "bibtex": []
+}
diff --git a/text/preview/requirements.txt b/text/preview/requirements.txt
new file mode 100644
index 0000000..136984d
--- /dev/null
+++ b/text/preview/requirements.txt
@@ -0,0 +1 @@
+pyclowder==2.7.0
\ No newline at end of file
diff --git a/text/preview/simple/Dockerfile b/text/preview/simple/Dockerfile
new file mode 100644
index 0000000..0770b57
--- /dev/null
+++ b/text/preview/simple/Dockerfile
@@ -0,0 +1,5 @@
+ARG PYCLOWDER_PYTHON=""
+FROM clowder/extractors-simple-extractor${PYCLOWDER_PYTHON}:onbuild
+# EXTRACTION_FUNC must name the function defined in simpletextpreview.py (textpreview).
+ENV EXTRACTION_FUNC="textpreview"
+ENV EXTRACTION_MODULE="simpletextpreview"
diff --git a/text/preview/simple/extractor_info.json b/text/preview/simple/extractor_info.json
new file mode 100644
index 0000000..fd0f074
--- /dev/null
+++ b/text/preview/simple/extractor_info.json
@@ -0,0 +1,24 @@
+{
+  "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
+  "name": "ncsa.text.preview",
+  "version": "1.0",
+  "description": "Given a text file, keep the first 1MB of text.",
+  "author": "Luigi Marini <lmarini@illinois.edu>",
+  "contributors": [],
+  "contexts": [
+  ],
+  "repository": [
+    {
+      "repType": "git",
+      "repUrl": "https://github.com/clowder-framework/extractors-core.git"
+    }
+  ],
+  "process": {
+    "file": [
+      "text/*"
+    ]
+  },
+  "external_services": [],
+  "dependencies": [],
+  "bibtex": []
+}
diff --git a/text/preview/simple/simpletextpreview.py b/text/preview/simple/simpletextpreview.py
new file mode 100644
index 0000000..5bdc050
--- /dev/null
+++ b/text/preview/simple/simpletextpreview.py
@@ -0,0 +1,37 @@
+"""This version does not currently work. Just leaving it here until I can figure out why not."""
+
+import os
+import tempfile
+
+
+def textpreview(input_file_path):
+    """
+    This function keeps the first 1000 lines of a text files and uploads them as a preview to the file.
+
+    :param input_file_path: Full path to the input file
+    :return: Result dictionary containing the path on disk of the preview
+    """
+    numlines = 10
+
+    with open(input_file_path, 'r') as file:
+        lines = file.readlines(numlines)
+        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp:
+            for line in lines:
+                print("Line " + line)
+                tmp.write(line)
+
+        # (fd, tmp) = tempfile.mkstemp(suffix='.txt')
+        # with open(tmp, 'w') as f:
+        #     for line in lines:
+        #         print("Line " + line)
+        #         f.write(line)
+        # os.close(fd)
+
+    # return the path of preview on disk
+    print("Preview path " + tmp.name)
+    result = {
+        'previews': [tmp.name]
+    }
+
+    # Return the result dictionary
+    return result
diff --git a/text/preview/text_preview_extractor.py b/text/preview/text_preview_extractor.py
new file mode 100755
index 0000000..67952ff
--- /dev/null
+++ b/text/preview/text_preview_extractor.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+import logging
+import tempfile
+
+import pyclowder
+from pyclowder.extractors import Extractor
+
+
+class TextPreviewExtractor(Extractor):
+
+    def __init__(self):
+        Extractor.__init__(self)
+        self.setup()
+
+        # setup logging for the extractor
+        logging.getLogger('pyclowder').setLevel(logging.DEBUG)
+        logging.getLogger('__main__').setLevel(logging.DEBUG)
+
+    def process_message(self, connector, host, secret_key, resource, parameters):
+        # Process the file and upload the results
+
+        inputfile = resource["local_paths"][0]
+        file_id = resource['id']
+
+        # 1 MB
+        num_bytes = 1000000
+
+        with open(inputfile, 'r') as file:
+            lines = file.readlines(num_bytes)
+            tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", delete=False, dir='./')
+            tmp.write("Previewing up to the first 1 megabyte of file contents. Download file to see full data.\n---\n")
+            for line in lines:
+                tmp.write(line)
+            tmp.close()
+
+        pyclowder.files.upload_preview(connector, host, secret_key, file_id, tmp.name, None)
+        connector.status_update(pyclowder.utils.StatusMessage.processing, resource,
+                                "Uploaded preview of type txt")
+
+
+if __name__ == "__main__":
+    extractor = TextPreviewExtractor()
+    extractor.start()