pageuppeople-opensource
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 1 deletion b/‎.gitignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 19 additions & 9 deletions b/‎README.md‎
Lines changed: 19 additions & 9 deletions
diff --git a/‎modules/DataPipelineExecutionRepository.py‎
Lines changed: 0 additions & 40 deletions b/‎modules/DataPipelineExecutionRepository.py‎
Lines changed: 0 additions & 40 deletions
diff --git a/‎modules/DataRepository.py‎
Lines changed: 75 additions & 0 deletions b/‎modules/DataRepository.py‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎modules/ModelChangeDetector.py‎
Lines changed: 50 additions & 26 deletions b/‎modules/ModelChangeDetector.py‎
Lines changed: 50 additions & 26 deletions
diff --git a/‎modules/Shared.py‎
Lines changed: 5 additions & 2 deletions b/‎modules/Shared.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎modules/commands/BaseCommand.py‎
Lines changed: 3 additions & 3 deletions b/‎modules/commands/BaseCommand.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎modules/commands/CompareCommand.py‎
Lines changed: 51 additions & 0 deletions b/‎modules/commands/CompareCommand.py‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎modules/commands/CompleteCommand.py‎
Lines changed: 13 additions & 0 deletions b/‎modules/commands/CompleteCommand.py‎
Lines changed: 13 additions & 0 deletions
@@ -107,4 +107,6 @@ venv.bak/
 .vscode/
 
 # JetBrains
-.idea/
+.idea/
+
+test-models/
@@ -13,16 +13,20 @@ py mcd.py [options] <command> [command-parameters]
 - `options` include:
   - `--help | -h`: displays help menu.
   - `--log-level | -l`: choose program's logging level, from CRITICAL, ERROR, WARNING, INFO, DEBUG; default is INFO.
+- `db-connection-string`: a [PostgreSQL Db Connection String](http://docs.sqlalchemy.org/en/latest/dialects/postgresql.html#module-sqlalchemy.dialects.postgresql.psycopg2) of the format `postgresql+psycopg2://user:password@host:port/dbname`
 - `command` is the function to be performed by the utility. The currently supported values are:
-  - `start`: Marks the start of a new execution by creating a record for the same in the given database. Returns an `execution-id` which is a GUID identifier of the new execution.
-    - `db-connection-string`: a [PostgreSQL Db Connection String](http://docs.sqlalchemy.org/en/latest/dialects/postgresql.html#module-sqlalchemy.dialects.postgresql.psycopg2) of the format `postgresql+psycopg2://user:password@host:port/dbname`
-  - `finish`: Marks the completion of an existing execution by updating a record for the same in the given database. Returns nothing unless there's an error.
-    - `db-connection-string`: a [PostgreSQL Db Connection String](http://docs.sqlalchemy.org/en/latest/dialects/postgresql.html#module-sqlalchemy.dialects.postgresql.psycopg2) of the format `postgresql+psycopg2://user:password@host:port/dbname`
-    - `execution-id`: a GUID identifier of an existing data pipeline execution.
+  - `init`: Marks the start of a new execution by creating a record for the same in the given database. Returns an `execution-id` which is a GUID identifier of the new execution.
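+  - `compare`: Compares & persists SHA256-hashed checksums of the given models against those of the last successful execution. Returns a comma-separated string of changed model names, or `*` (meaning all models) when no checksums from a previous successful execution exist for the given `model-type`.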
+    - `execution-id`: a GUID identifier of an existing data pipeline execution as returned by the `init` command.
+    - `model-type`: type of models being processed, e.g. `load`, `transform`, etc. Model checksums are grouped by this `model-type`, which is also used to find the checksums of the last successful execution to compare against.
+    - `base-path`: absolute or relative path to the models e.g.: `./load`, `/home/local/load`, `C:/path/to/load`
+    - `model-patterns`: one or more path-based patterns _(relative to `base-path`)_ matching the model files, including their extensions. Models within a model-type must be named uniquely regardless of their file extension. E.g.: `*.txt`, `**/*.txt`, `./relative/path/to/some_models/**/*.csv`, `relative/path/to/some/more/related/models/**/*.sql`
+  - `complete`: Marks the completion of an existing execution by updating a record for the same in the given database. Returns nothing unless there's an error.
+    - `execution-id`: a GUID identifier of an existing data pipeline execution as returned by the `init` command.
 
 To get help,use:
 
-```
+```commandline
 py mcd.py --help
 py mcd.py <command> --help
 ```
@@ -41,7 +45,10 @@ new-env\scripts\activate
 
 py -m pip install -r requirements.txt
 
-py mcd.py start postgresql+psycopg2://user:password@host:port/dbname
+py mcd.py postgresql+psycopg2://user:password@host:port/dbname init
+py mcd.py postgresql+psycopg2://user:password@host:port/dbname compare execution-id-as-returned-by-init-command load ./relative/path/to/load/models **/*.json
+py mcd.py postgresql+psycopg2://user:password@host:port/dbname compare execution-id-as-returned-by-init-command transform C:/absolute/path/to/transform/models group1/*.csv ./group2/**/*.sql
+py mcd.py postgresql+psycopg2://user:password@host:port/dbname complete execution-id-as-returned-by-init-command
 ```
 
 ### As a package
@@ -64,7 +71,10 @@ new-env\scripts\activate
 
 pip install -e git+git://github.com/PageUpPeopleOrg/model-change-detector.git#egg=mcd
 
-py -m mcd start postgresql+psycopg2://user:password@host:port/dbname
+py -m mcd postgresql+psycopg2://user:password@host:port/dbname init
+py -m mcd postgresql+psycopg2://user:password@host:port/dbname compare execution-id-as-returned-by-init-command load ./relative/path/to/load/models **/*.json
+py -m mcd postgresql+psycopg2://user:password@host:port/dbname compare execution-id-as-returned-by-init-command transform C:/absolute/path/to/transform/models group1/*.csv ./group2/**/*.sql
+py -m mcd postgresql+psycopg2://user:password@host:port/dbname complete execution-id-as-returned-by-init-command
 ```
 
 ## Setup
@@ -113,7 +123,7 @@ On Linux / Mac OS
 
 You should see the name of your virtual environment in brackets on your terminal line, e.g.:
 
-```
+```commandline
 C:\path\to\working\dir: new-env\scripts\activate
 (new-env) C:\path\to\working\dir: _
 ```
 
@@ -0,0 +1,75 @@
+from sqlalchemy import desc
+
+from modules import Shared
+from modules.BaseObject import BaseObject
+from modules.Shared import Constants
+from modules.entities.DataPipelineExecutionEntity import DataPipelineExecutionEntity
+from modules.entities.ModelChecksumEntity import ModelChecksumEntity
+
+
+class DataRepository(BaseObject):
+    def __init__(self, session_maker, logger=None):
+        super().__init__(logger)
+        self.session_maker = session_maker
+
+    def ensure_schema_exists(self, engine):
+        engine.execute(f'CREATE SCHEMA IF NOT EXISTS {Constants.DATA_PIPELINE_EXECUTION_SCHEMA_NAME}')
+        Shared.BaseEntity.metadata.create_all(engine)
+
+    def initialise_execution(self):
+        session = self.session_maker()
+
+        data_pipeline_execution = DataPipelineExecutionEntity()
+        session.add(data_pipeline_execution)
+
+        session.commit()
+        return data_pipeline_execution
+
+    def get_last_successful_models(self, model_type):
+        last_successful_models = {}
+        session = self.session_maker()
+
+        last_successful_execution = session.query(DataPipelineExecutionEntity) \
+            .filter_by(status=Constants.DataPipelineExecutionStatus.COMPLETED) \
+            .order_by(desc(DataPipelineExecutionEntity.last_updated_on)) \
+            .order_by(desc(DataPipelineExecutionEntity.created_on)) \
+            .first()
+
+        if last_successful_execution is None:
+            return last_successful_models
+
+        previous_model_checksums = session.query(ModelChecksumEntity) \
+            .filter_by(execution_id=last_successful_execution.id, type=model_type)
+
+        for model_checksum_entity in previous_model_checksums:
+            last_successful_models[model_checksum_entity.name] = model_checksum_entity.checksum
+
+        return last_successful_models
+
+    def save_execution_progress(self, execution_id, model_type, model_checksums):
+        session = self.session_maker()
+
+        data_pipeline_execution = session.query(DataPipelineExecutionEntity) \
+            .filter_by(id=execution_id) \
+            .one()
+
+        for model, checksum in sorted(model_checksums.items()):
+            model_checksum_entity = ModelChecksumEntity(execution_id=data_pipeline_execution.id,
+                                                        type=model_type,
+                                                        name=model,
+                                                        checksum=checksum)
+            session.add(model_checksum_entity)
+
+        session.commit()
+        return data_pipeline_execution
+
+    def complete_execution(self, execution_id):
+        session = self.session_maker()
+
+        data_pipeline_execution = session.query(DataPipelineExecutionEntity) \
+            .filter_by(id=execution_id) \
+            .one()
+        data_pipeline_execution.status = Constants.DataPipelineExecutionStatus.COMPLETED
+
+        session.commit()
+        return data_pipeline_execution
@@ -1,10 +1,12 @@
 import argparse
 import logging
+
 from modules import Shared
-from modules.Shared import Constants
 from modules.BaseObject import BaseObject
-from modules.commands.StartCommand import StartCommand
-from modules.commands.FinishCommand import FinishCommand
+from modules.Shared import Constants
+from modules.commands.CompareCommand import CompareCommand
+from modules.commands.CompleteCommand import CompleteCommand
+from modules.commands.InitialiseCommand import InitialiseCommand
 
 
 class ModelChangeDetector(BaseObject):
@@ -19,41 +21,63 @@ def __init__(self, logger=None):
 
         self.args.func()
 
-    def __process_start_command(self):
-        StartCommand(self.args.db_connection_string).execute()
+    def __process_init_command(self):
+        """Build an ``InitialiseCommand`` from the parsed db connection string and execute it."""
+        InitialiseCommand(self.args.db_connection_string).execute()
+
+    def __process_compare_command(self):
+        """Build a ``CompareCommand`` from the parsed ``compare`` arguments and execute it."""
+        CompareCommand(self.args.db_connection_string, self.args.execution_id, self.args.model_type,
+                       self.args.base_path, self.args.model_patterns).execute()
 
-    def __process_finish_command(self):
-        FinishCommand(self.args.db_connection_string, self.args.execution_id).execute()
+    def __process_complete_command(self):
+        """Build a ``CompleteCommand`` from the parsed connection string and execution id and execute it."""
+        CompleteCommand(self.args.db_connection_string, self.args.execution_id).execute()
 
     def __get_arguments(self):
         parser = argparse.ArgumentParser(description=Constants.APP_NAME,
-                                         usage='mcd [options] <command> [command-parameters]\n\n'
+                                         usage='mcd [options] <db-connection-string> <command> [command-parameters]\n\n'
                                                'To see help text, you can run\n'
                                                '  mcd --help\n'
-                                               '  mcd <command> --help',
+                                               '  mcd <db-connection-string> <command> --help\n\n',
                                          parents=[Shared.get_default_arguments()])
 
+        parser.add_argument('db_connection_string',
+                            metavar='db-connection-string',
+                            help='provide in PostgreSQL & Psycopg format, '
+                                 'postgresql+psycopg2://username:password@host:port/dbname')
+
         subparsers = parser.add_subparsers(title='commands', metavar='', dest='command')
 
-        start_command_parser = subparsers.add_parser('start', help='help text for \'start\' command')
-        start_command_parser.set_defaults(func=self.__process_start_command)
-        self.__get_default_command_arguments(start_command_parser)
+        init_command_parser = subparsers.add_parser('init', help='initialises a new data pipeline execution')
+        init_command_parser.set_defaults(func=self.__process_init_command)
+
+        compare_command_parser = subparsers.add_parser('compare', help='compares given models with those of the last '
+                                                                       'successfully processed data pipeline '
+                                                                       'execution. also persists given models against '
+                                                                       'the given data pipeline execution.')
+        compare_command_parser.set_defaults(func=self.__process_compare_command)
+        compare_command_parser.add_argument('execution_id',
+                                            metavar='execution-id',
+                                            help='data pipeline execution id as received using \'init\' command')
+        compare_command_parser.add_argument('model_type',
+                                            metavar='model-type',
+                                            help='a string name for the type of models to compare. used to group '
+                                                 'models between various calls to this command for same data pipeline '
+                                                 'execution. e.g. load, transform')
+        compare_command_parser.add_argument('base_path',
+                                            metavar='base-path',
+                                            help='absolute or relative path to the base directory of all models')
+        compare_command_parser.add_argument('model_patterns',
+                                            metavar='model-patterns',
+                                            nargs='+',
+                                            help='one or more unix-style search patterns for model files. e.g.: '
+                                                 '*.txt, **/*.json, ./path/to/some_models/**/*.csv, '
+                                                 'path/to/some/more/related/models/**/*.sql')
 
-        finish_command_parser = subparsers.add_parser('finish', help='help text for \'finish\' command')
-        finish_command_parser.set_defaults(func=self.__process_finish_command)
-        self.__get_default_command_arguments(finish_command_parser)
-        finish_command_parser.add_argument('execution_id',
-                                           metavar='execution_id',
-                                           help='data pipeline execution id as received using \'start\' command')
+        complete_command_parser = subparsers.add_parser('complete', help='completees the given data pipeline execution.')
+        complete_command_parser.set_defaults(func=self.__process_complete_command)
+        complete_command_parser.add_argument('execution_id',
+                                           metavar='execution-id',
+                                           help='data pipeline execution id as received using \'init\' command')
 
         args = parser.parse_args()
 
         return args
-
-    @staticmethod
-    def __get_default_command_arguments(command_parser):
-        command_parser.add_argument('db_connection_string',
-                                    metavar='db-connection-string',
-                                    help='provide in PostgreSQL & Psycopg format, '
-                                         'postgresql+psycopg2://username:password@host:port/dbname')
-
 
@@ -1,14 +1,17 @@
 import logging
 import argparse
+from sqlalchemy.ext.declarative import declarative_base
+
+BaseEntity = declarative_base()
 
 
 class Constants:
+    """Constant values shared across the application's modules."""
     APP_NAME = 'model-change-detector'
     DATA_PIPELINE_EXECUTION_SCHEMA_NAME = 'data_pipeline'
 
     class DataPipelineExecutionStatus:
+        """String values used for the status of a data pipeline execution."""
-        STARTED = 'STARTED'
-        COMPLETED_SUCCESSFULLY = 'SUCCESSFUL'
+        INITIALISED = 'INITIALISED'
+        COMPLETED = 'COMPLETED'
 
 
 _logLevelStrings = [logging.getLevelName(logging.CRITICAL),
 
@@ -1,6 +1,6 @@
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
-from modules.DataPipelineExecutionRepository import DataPipelineExecutionRepository
+from modules.DataRepository import DataRepository
 from modules.BaseObject import BaseObject
 
 
@@ -9,5 +9,5 @@ def __init__(self, db_connection_string, logger=None):
         super().__init__(logger)
         self.db_engine = create_engine(db_connection_string, echo=False)
         self.session_maker = sessionmaker(bind=self.db_engine)
-        self.repository = DataPipelineExecutionRepository(self.session_maker)
-        self.repository.create_schema(engine=self.db_engine)
+        self.repository = DataRepository(self.session_maker)
+        self.repository.ensure_schema_exists(engine=self.db_engine)
@@ -0,0 +1,51 @@
+import hashlib
+from pathlib import Path
+from modules.commands.BaseCommand import BaseCommand
+
+
+class CompareCommand(BaseCommand):
+    def __init__(self, db_connection_string, execution_id, model_type, base_path, model_patterns, logger=None):
+        super().__init__(db_connection_string, logger)
+        self._execution_id = execution_id
+        self._model_type = model_type
+        self._base_path = base_path
+        self._model_patterns = model_patterns
+        self._changed_models_separator = ','
+
+    def execute(self):
+        model_folder = Path(self._base_path)
+        if not model_folder.is_dir():
+            raise NotADirectoryError(self._base_path)
+
+        current_model_checksums = {}
+        for model_pattern in self._model_patterns:
+            for model_file in model_folder.glob(model_pattern):
+                if model_file.is_file():
+                    current_model_checksums[model_file.stem] = self.__get_file_checksum(model_file)
+
+        data_pipeline_execution = self.repository.save_execution_progress(self._execution_id, self._model_type, current_model_checksums)
+        self.logger.debug(f'Comparing data_pipeline_execution = ${str(data_pipeline_execution)}')
+
+        previous_model_checksums = self.repository.get_last_successful_models(self._model_type)
+
+        if len(previous_model_checksums) == 0:
+            print('*')
+            self.logger.debug(f'Changed models: ALL')
+            return
+
+        changed_models = []
+        for model, current_checksum in current_model_checksums.items():
+            if model not in previous_model_checksums or previous_model_checksums[model] != current_checksum:
+                changed_models.append(model)
+
+        print(self._changed_models_separator.join(changed_models))
+        self.logger.debug(f'Changed models: \'${str(changed_models)}\'')
+
+    def __get_file_checksum(self, file: Path):
+        data = file.read_bytes()
+        hash_function = hashlib.sha256()
+        hash_function.update(data)
+        checksum = hash_function.hexdigest()
+        self.logger.debug(f'filename={file.name}, filepath=\'{file.absolute().as_posix()}\'')
+        self.logger.debug(f'hash_function={hash_function.name}, checksum_len={len(checksum)}, checksum={checksum}')
+        return checksum
@@ -0,0 +1,13 @@
+import hashlib
+from pathlib import Path
+from modules.commands.BaseCommand import BaseCommand
+
+
+class CompleteCommand(BaseCommand):
+    def __init__(self, db_connection_string, execution_id, logger=None):
+        super().__init__(db_connection_string, logger)
+        self._execution_id = execution_id
+
+    def execute(self):
+        data_pipeline_execution = self.repository.complete_execution(self._execution_id)
+        self.logger.debug('Completed data_pipeline_execution = ' + str(data_pipeline_execution))