Skip to content

Create a preview for a text file with the first 1MB text in the file. #32

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions text/preview/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM python:3.8

WORKDIR /extractor

# Install dependencies in their own layer so it is rebuilt only when
# requirements.txt changes; --no-cache-dir keeps pip's download cache out of the image.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY text_preview_extractor.py extractor_info.json ./

# Exec form starts python directly instead of under "/bin/sh -c", so the
# extractor process itself receives the stop signal when the container is stopped.
CMD ["python", "text_preview_extractor.py"]
24 changes: 24 additions & 0 deletions text/preview/extractor_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
"name": "ncsa.text.preview",
"version": "1.0",
"description": "Given a text file, keep the first 1MB of text.",
"author": "Luigi Marini <[email protected]>",
"contributors": [],
"contexts": [
],
"repository": [
{
"repType": "git",
"repUrl": "https://github.com/clowder-framework/extractors-core.git"
}
],
"process": {
"file": [
"text/*"
]
},
"external_services": [],
"dependencies": [],
"bibtex": []
}
1 change: 1 addition & 0 deletions text/preview/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pyclowder==2.7.0
5 changes: 5 additions & 0 deletions text/preview/simple/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# PYCLOWDER_PYTHON is an optional suffix appended to the base image name
# (empty by default).
ARG PYCLOWDER_PYTHON=""
FROM clowder/extractors-simple-extractor${PYCLOWDER_PYTHON}:onbuild

# EXTRACTION_MODULE names the Python module (simpletextpreview.py) and
# EXTRACTION_FUNC the function to call inside it. The module defines
# textpreview(), not simpletextpreview(), so the function name must be "textpreview".
ENV EXTRACTION_FUNC="textpreview"
ENV EXTRACTION_MODULE="simpletextpreview"
24 changes: 24 additions & 0 deletions text/preview/simple/extractor_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
"name": "ncsa.text.preview",
"version": "1.0",
"description": "Given a text file, keep the first 1MB of text.",
"author": "Luigi Marini <[email protected]>",
"contributors": [],
"contexts": [
],
"repository": [
{
"repType": "git",
"repUrl": "https://github.com/clowder-framework/extractors-core.git"
}
],
"process": {
"file": [
"text/*"
]
},
"external_services": [],
"dependencies": [],
"bibtex": []
}
37 changes: 37 additions & 0 deletions text/preview/simple/simpletextpreview.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""This version does not currently work. Just leaving it here until I can figure out why not."""

import os
import tempfile


def textpreview(input_file_path, numlines=1000):
    """
    Copy the first ``numlines`` lines of a text file into a temporary preview file.

    The preview is written to a new ``.txt`` temporary file which is left on disk
    (it is not deleted here) so the caller can pick it up from the returned path.

    :param input_file_path: Full path to the input file
    :param numlines: Maximum number of lines copied into the preview (default 1000)
    :return: Result dictionary whose ``previews`` entry lists the path on disk of the preview
    """
    from itertools import islice

    # Undecodable bytes are replaced instead of raising, so a file that is not
    # valid UTF-8 still yields a preview.
    with open(input_file_path, 'r', encoding='utf-8', errors='replace') as file:
        # islice stops after ``numlines`` lines. file.readlines(n) would not:
        # its argument is a size hint, not a line count.
        lines = list(islice(file, numlines))

    # mode="w" is required: NamedTemporaryFile defaults to binary mode, which
    # rejects the str lines read above.
    with tempfile.NamedTemporaryFile(suffix=".txt", mode="w", encoding="utf-8",
                                     delete=False) as tmp:
        tmp.writelines(lines)

    # return the path of preview on disk
    print("Preview path " + tmp.name)
    return {
        'previews': [tmp.name]
    }
43 changes: 43 additions & 0 deletions text/preview/text_preview_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python
import logging
import os
import tempfile

import pyclowder
from pyclowder.extractors import Extractor


class TextPreviewExtractor(Extractor):
    """Extractor that uploads the beginning of a text file as a preview of that file."""

    def __init__(self):
        Extractor.__init__(self)
        self.setup()

        # setup logging for the extractor
        logging.getLogger('pyclowder').setLevel(logging.DEBUG)
        logging.getLogger('__main__').setLevel(logging.DEBUG)

    def process_message(self, connector, host, secret_key, resource, parameters):
        """
        Build a preview from the start of the input file and upload it.

        :param connector: connector passed through to the upload and used for the status update
        :param host: passed through to ``pyclowder.files.upload_preview``
        :param secret_key: passed through to ``pyclowder.files.upload_preview``
        :param resource: message resource; ``resource["local_paths"][0]`` is read as the
            input file and ``resource['id']`` as the id of the file receiving the preview
        :param parameters: not used by this extractor
        """
        inputfile = resource["local_paths"][0]
        file_id = resource['id']

        # Roughly 1 MB. The limit is counted in characters, so text with
        # multi-byte characters can be somewhat larger on disk.
        num_chars = 1000000

        # read() is bounded even when the file is one enormous line, whereas
        # readlines(hint) always finishes the line it is on. Undecodable bytes
        # are replaced so a file that is not valid UTF-8 still gets a preview.
        with open(inputfile, 'r', encoding='utf-8', errors='replace') as file:
            head = file.read(num_chars)

        tmp = tempfile.NamedTemporaryFile(suffix=".txt", mode="w", encoding="utf-8",
                                          delete=False, dir='./')
        try:
            # Leaving the with-block closes (and flushes) the file before it is uploaded.
            with tmp:
                tmp.write("Previewing up to the first 1 megabyte of file contents. Download file to see full data.\n---\n")
                tmp.write(head)

            pyclowder.files.upload_preview(connector, host, secret_key, file_id, tmp.name, None)
        finally:
            # delete=False means nothing else removes the file; without this
            # every processed message would leave a file in the working directory.
            os.remove(tmp.name)

        connector.status_update(pyclowder.utils.StatusMessage.processing, resource,
                                "Uploaded preview of type txt")


if __name__ == "__main__":
    # Script entry point: create the extractor and call start() on it.
    TextPreviewExtractor().start()