pickled VOCABS working

CSIRO-enviro-informatics · Apr 11, 2019 · 03afcac · 03afcac
1 parent 209052a
commit 03afcac
Show file tree

Hide file tree

Showing 15 changed files with 701 additions and 215 deletions.
diff --git a/DATA_SOURCES.md b/DATA_SOURCES.md
@@ -36,10 +36,10 @@ Here you see vocabularies with IDs 'rva-50', 'rva-52', 'tenement_type' & 'Test_R
 
 The controlled list of source types (`VocabSource.FILE`, `VocabSource.VOCBENCH` etc.) are handled by dedicated *source* Python code classes that present a standard set of methods for each type. The files currently implemented, all in the `data/` folder, are:
 
-* `source_RVA.py` - RVA
-* `source_FILE.py` - FILE
-* `source_VOCBENCH.py` - VOCBENCH
+* `RVA.py` - RVA
+* `FILE.py` - FILE
+* `VOCBENCH.py` - VOCBENCH
 
 Additional source files for other vocabulary data sources can be made by creating new `source_*.py` files inheriting from `source.py`.
 
-The specific requirements for each source are contained within their particular files but, summarising the requirements for the sources already catered for, Vocabularies from RVA need to have endpoints specified in the vocab source file `data/source_RVA.py` so VocPrez knows where to get info from. RDF files in `data/` will automatically be picked up by VocPrez so don;t need any more config than a title, provided the ID matched the file name, minus file extension. Vocabs from VocBench require that a `VB_ENDPOINT`, `VB_USER` & `VB_PASSWORD` are all given in the config file.
+The specific requirements for each source are contained within their particular files but, summarising the requirements for the sources already catered for, Vocabularies from RVA need to have endpoints specified in the vocab source file `data/RVA.py` so VocPrez knows where to get info from. RDF files in `data/` will automatically be picked up by VocPrez so don't need any more config than a title, provided the ID matches the file name, minus file extension. Vocabs from VocBench require that a `VB_ENDPOINT`, `VB_USER` & `VB_PASSWORD` are all given in the config file.
diff --git a/_config/template.py b/_config/template.py
@@ -1,8 +1,6 @@
 from os import path
-from data.source_FILE import FILE
-from data.source_RVA import RVA
+
 # RVA doesn't need to be imported as its list_vocabularies method isn't used - vocabs from that are statically listed
-from data.source_VOCBENCH import VOCBENCH
 
 APP_DIR = path.dirname(path.dirname(path.realpath(__file__)))
 TEMPLATES_DIR = path.join(APP_DIR, 'view', 'templates')

diff --git a/app.py b/app.py
@@ -1,26 +1,49 @@
 import logging
-import _config
-from flask import Flask
+import _config as config
+from flask import Flask, g
 from controller import routes
 import helper
-from data.source_FILE import FILE
-from data.source_RVA import RVA
-from data.source_VOCBENCH import VOCBENCH
+import data.source as source
+import os
+import pickle
 
-app = Flask(__name__, template_folder=_config.TEMPLATES_DIR, static_folder=_config.STATIC_DIR)
+app = Flask(__name__, template_folder=config.TEMPLATES_DIR, static_folder=config.STATIC_DIR)
 
 app.register_blueprint(routes.routes)
 
 
-@app.before_first_request
-def start_up_tasks():
-    VOCBENCH.init()
-    RVA.init()
-    FILE.init()
-    # extend this instances' list of vocabs by using the known sources
-    VOCABS = {**_config.VOCABS, **FILE.list_vocabularies()}  # picks up all vocab RDF (turtle) files in data/
-    # VOCABS = {**VOCABS, **VOCBENCH.list_vocabularies()}  # picks up all vocabs at the relevant VocBench instance
-    print('Finished startup tasks.')
+@app.before_request
+def before_request():
+    """
+    Populate g.VOCABS before a request is handled.
+
+    The vocab index is taken from the pickled VOCABS.p file in config.APP_DIR when that file exists; otherwise it
+    is rebuilt by calling collect() on the source class named by each entry of VOCAB_SOURCES (defined in the
+    _config package) and then written to VOCABS.p for future use.
+
+    :return: nothing
+    """
+    # NOTE(review): flask.g lives only as long as the application context, so this guard is normally False at the
+    # start of a request and VOCABS.p is re-read each time - confirm whether a longer-lived cache was intended
+    if hasattr(g, 'VOCABS'):
+        return
+
+    # no g.VOCABS yet, so try to load it from the pickled VOCABS.p file
+    vocabs_file_path = os.path.join(config.APP_DIR, 'VOCABS.p')
+    if os.path.isfile(vocabs_file_path):
+        # the with block closes the file on exit, so no explicit close() is needed
+        with open(vocabs_file_path, 'rb') as f:
+            g.VOCABS = pickle.load(f)
+        return
+
+    # VOCABS.p is missing, so recreate the index from the configured sources:
+    # each entry's details['source'] names the class in data.source whose collect() is called
+    # (presumably collect() adds its vocabs to g.VOCABS - its body is not visible here, TODO confirm)
+    g.VOCABS = {}
+    for details in config.VOCAB_SOURCES.values():
+        getattr(source, details['source']).collect(details)
+
+    # persist the freshly collected index to VOCABS.p for future use
+    with open(vocabs_file_path, 'wb') as f:
+        pickle.dump(g.VOCABS, f)
 
 
 @app.context_processor
@@ -35,9 +58,9 @@ def context_processor():
 
 # run the Flask app
 if __name__ == '__main__':
-    logging.basicConfig(filename=_config.LOGFILE,
+    logging.basicConfig(filename=config.LOGFILE,
                         level=logging.DEBUG,
                         datefmt='%Y-%m-%d %H:%M:%S',
                         format='%(asctime)s %(levelname)s %(filename)s:%(lineno)s %(message)s')
 
-    app.run(debug=_config.DEBUG, threaded=True)
+    app.run(debug=config.DEBUG, threaded=True)
diff --git a/controller/routes.py b/controller/routes.py
@@ -1,25 +1,24 @@
-from flask import Blueprint, Response, request, render_template
+from flask import Blueprint, Response, request, render_template, Markup, g
 from model.vocabulary import VocabularyRenderer
 from model.concept import ConceptRenderer
 from model.collection import CollectionRenderer
 from model.skos_register import SkosRegisterRenderer
 import _config as config
 import markdown
-from flask import Markup
-from data.source import Source
-from data.source_VOCBENCH import VbException
+from data.source._source import Source
+from data.source.VOCBENCH import VbException
 import json
 
 routes = Blueprint('routes', __name__)
 
 
 def render_invalid_vocab_id_response():
-    msg = """The vocabulary ID that was supplied was not known. It must be one of these: \n\n* """ + '\n* '.join(config.VOCABS.keys())
+    msg = """The vocabulary ID that was supplied was not known. It must be one of these: \n\n* """ + '\n* '.join(g.VOCABS.keys())
     msg = Markup(markdown.markdown(msg))
     return render_template('error.html', title='Error - invalid vocab id', heading='Invalid Vocab ID', msg=msg)
     # return Response(
     #     'The vocabulary ID you\'ve supplied is not known. Must be one of:\n ' +
-    #     '\n'.join(config.VOCABS.keys()),
+    #     '\n'.join(g.VOCABS.keys()),
     #     status=400,
     #     mimetype='text/plain'
     # )
@@ -43,14 +42,14 @@ def render_invalid_object_class_response(vocab_id, uri, c_type):
     return render_template('error.html', title='Error - Object Class URI', heading='Concept Class Type Error', msg=msg)
 
 
-def get_a_vocab_source_key():
+def get_a_vocab_key():
     """
-    Get the first key from the config.VOCABS dictionary.
+    Get the first key from the g.VOCABS dictionary.
 
     :return: Key name
     :rtype: str
     """
-    return next(iter(config.VOCABS))
+    return next(iter(g.VOCABS))
 
 
 @routes.route('/')
@@ -64,6 +63,16 @@ def index():
     )
 
 
+def get_a_vocab_source_key():
+    """
+    Get the first key from the g.VOCABS dictionary.
+
+    NOTE(review): the body is the same expression as get_a_vocab_key() - confirm whether both names are needed.
+
+    :return: Key name
+    :rtype: str
+    :raises StopIteration: if g.VOCABS is empty
+    """
+    return next(iter(g.VOCABS))
+
+
 def match(vocabs, query):
     """
     Generate a generator of vocabulary items that match the search query
@@ -84,15 +93,15 @@ def vocabularies():
     per_page = int(request.values.get('per_page')) if request.values.get('per_page') is not None else 20
 
     # TODO: replace this logic with the following
-    #   1. read all static vocabs from config.VOCABS
+    #   1. read all static vocabs from g.VOCABS
     # get this instance's list of vocabs
-    vocabs = []
-    for k, v in config.VOCABS.items():
+    vocabs = []  # local copy (to this request) for sorting
+    for k, v in g.VOCABS.items():
         v['vocab_id'] = k
         v['uri'] = request.base_url + k
         vocabs.append(v)
     vocabs.sort(key=lambda item: item['title'])
-    total = len(config.VOCABS.items())
+    total = len(g.VOCABS.items())
 
     # Search
     query = request.values.get('search')
@@ -124,7 +133,7 @@ def vocabularies():
 
 @routes.route('/vocabulary/<vocab_id>')
 def vocabulary(vocab_id):
-    if vocab_id not in config.VOCABS.keys():
+    if vocab_id not in g.VOCABS.keys():
         return render_invalid_vocab_id_response()
 
     # get vocab details using appropriate source handler
@@ -141,7 +150,7 @@ def vocabulary(vocab_id):
 
 @routes.route('/vocabulary/<vocab_id>/concept/')
 def vocabulary_list(vocab_id):
-    if vocab_id not in config.VOCABS.keys():
+    if vocab_id not in g.VOCABS.keys():
         return render_invalid_vocab_id_response()
 
     v = Source(vocab_id, request)
@@ -169,7 +178,7 @@ def vocabulary_list(vocab_id):
         request,
         [],
         concepts,
-        config.VOCABS[vocab_id]['title'] + ' concepts',
+        g.VOCABS[vocab_id]['title'] + ' concepts',
         total,
         search_query=query,
         search_enabled=True,
@@ -206,10 +215,10 @@ def object():
     uri = request.values.get('uri')
 
     # check this vocab ID is known
-    if vocab_id not in config.VOCABS.keys():
+    if vocab_id not in g.VOCABS.keys():
         return Response(
             'The vocabulary ID you\'ve supplied is not known. Must be one of:\n ' +
-            '\n'.join(config.VOCABS.keys()),
+            '\n'.join(g.VOCABS.keys()),
             status=400,
             mimetype='text/plain'
         )
@@ -264,3 +273,21 @@ def about():
         navs={},
         content=content
     )
+
+
+@routes.route('/test')
+def test():
+    """
+    Debug route: return the unpickled contents of VOCABS.p as plain text.
+
+    :return: a text/plain Response holding str() of the object unpickled from VOCABS.p in config.APP_DIR,
+        or an empty body when that file does not exist
+    """
+    txt = ''
+    # for vocab_id, details in g.VOCABS.items():
+    #     txt = txt + '{}: {}\n'.format(vocab_id, details['title'])
+
+    # imported locally so this debug route does not depend on the module's top-level imports
+    import os
+    import pickle
+    vocabs_file_path = os.path.join(config.APP_DIR, 'VOCABS.p')
+    if os.path.isfile(vocabs_file_path):
+        # the with block closes the file on exit, so no explicit close() is needed
+        with open(vocabs_file_path, 'rb') as f:
+            txt = str(pickle.load(f))
+
+    return Response(txt, mimetype='text/plain')
diff --git a/data/source_FILE.py → data/source/FILE.py b/data/source_FILE.py → data/source/FILE.py
@@ -1,8 +1,8 @@
-from data.source import Source
-from os.path import dirname, realpath, join, abspath
+from data.source._source import Source
+from os.path import join
 import _config as config
 from rdflib import Graph, URIRef, RDF
-from rdflib.namespace import SKOS, DCTERMS, DC, OWL
+from rdflib.namespace import SKOS, DCTERMS, OWL
 import os
 import pickle
 from helper import APP_DIR, make_title
@@ -46,9 +46,9 @@ def init():
                         f.close()
 
         # Get register item metadata
-        for vocab_id in config.VOCABS:
-            if vocab_id in config.VOCABS:
-                if config.VOCABS[vocab_id]['source'] != config.VocabSource.FILE:
+        for vocab_id in g.VOCABS:
+            if vocab_id in g.VOCABS:
+                if g.VOCABS[vocab_id]['source'] != config.VocabSource.FILE:
                     continue
 
                 # Creators
@@ -58,7 +58,7 @@ def init():
                     for creator in g.objects(uri, DCTERMS.creator):
                         creators.append(str(creator))
                     break
-                config.VOCABS[vocab_id]['creators'] = creators
+                g.VOCABS[vocab_id]['creators'] = creators
 
                 # Date Created
                 date_created = None
@@ -71,22 +71,21 @@ def init():
                     for uri in g.subjects(RDF.type, SKOS.ConceptScheme):
                         for date in g.objects(uri, DCTERMS.date):
                             date_created = str(date)[:10]
-                config.VOCABS[vocab_id]['date_created'] = date_created
+                g.VOCABS[vocab_id]['date_created'] = date_created
 
                 # Date Modified
                 date_modified = None
                 for uri in g.subjects(RDF.type, SKOS.ConceptScheme):
                     for date in g.objects(uri, DCTERMS.modified):
                         date_modified = str(date)[:10]
-                config.VOCABS[vocab_id]['date_modified'] = date_modified
+                g.VOCABS[vocab_id]['date_modified'] = date_modified
 
                 # Version
                 version = None
                 for uri in g.subjects(RDF.type, SKOS.ConceptScheme):
                     for versionInfo in g.objects(uri, OWL.versionInfo):
                         version = versionInfo
-                config.VOCABS[vocab_id]['version'] = version
-
+                g.VOCABS[vocab_id]['version'] = version
 
     @classmethod
     def list_vocabularies(self):
@@ -103,8 +102,8 @@ def list_vocabularies(self):
         # TODO: Move this to list_concepts() method
         # list concepts
         vocabs = {}
-        # for v in config.VOCABS:
-        #     if config.VOCABS[v]['source'] == config.VocabSource.FILE:
+        # for v in g.VOCABS:
+        #     if g.VOCABS[v]['source'] == config.VocabSource.FILE:
         #         g = FILE.load_pickle(v)
         #         for s, p, o in g.triples((None, SKOS.inScheme, None)):
         #             if s not in vocabs:
@@ -267,8 +266,8 @@ def get_collection(self, uri):
         pass
 
     def get_concept(self, uri):
-        if config.VOCABS[self.vocab_id].get('turtle'):
-            g = Graph().parse(config.VOCABS[self.vocab_id]['turtle'])
+        if g.VOCABS[self.vocab_id].get('turtle'):
+            g = Graph().parse(g.VOCABS[self.vocab_id]['turtle'])
         else:
             g = Graph().parse(os.path.join(APP_DIR, 'vocab_files', self.vocab_id + '.ttl'), format='turtle')
 
@@ -523,8 +522,8 @@ def build_concept_hierarchy(vocab_id):
             raise Exception('topConcept not found')
 
     def get_object_class(self, uri):
-        if config.VOCABS[self.vocab_id].get('turtle'):
-            g = Graph().parse(config.VOCABS[self.vocab_id]['turtle'], format='turtle')
+        if g.VOCABS[self.vocab_id].get('turtle'):
+            g = Graph().parse(g.VOCABS[self.vocab_id]['turtle'], format='turtle')
         else:
             g = Graph().parse(os.path.join(APP_DIR, 'vocab_files', self.vocab_id + '.ttl'), format='turtle')
         for s, p, o in g.triples((URIRef(uri), RDF.type, SKOS.Concept)):

diff --git a/data/source_GITHUB.py → data/source/GITHUB.py b/data/source_GITHUB.py → data/source/GITHUB.py
@@ -1,4 +1,4 @@
-from data.source import Source
+from data.source._source import Source
 from os.path import dirname, realpath, join, abspath
 import _config as config
 from rdflib import Graph