Test & fix doc code (#7)
* Add tests for most examples
enra64 authored Aug 31, 2021
1 parent c8a88dc commit 61b725e
Showing 8 changed files with 366 additions and 72 deletions.
4 changes: 3 additions & 1 deletion .gitignore

$RECYCLE.BIN/
# End of https://www.gitignore.io/api/macos,linux,python,windows,jupyternotebook,visualstudiocode

.idea
.vscode

**/tmp_test_file_gen.py
73 changes: 58 additions & 15 deletions README.rst

Features
--------

* remove temporal offsets in the data
* remove clock speed offsets by stretching the data
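
Conceptually, the two corrections above amount to an affine transform of each
sensor's timestamps: a constant shift removes the temporal offset, and a stretch
factor compensates for differing clock speeds. A minimal sketch of that idea
(illustrative only, not jointly's actual implementation; ``offset`` and ``factor``
would be derived from the detected shakes):

.. code:: python

    import pandas as pd

    def shift_and_stretch(df: pd.DataFrame, offset: pd.Timedelta, factor: float) -> pd.DataFrame:
        """Shift all timestamps by ``offset``, then stretch them around the first sample."""
        corrected = df.copy()
        elapsed = df.index - df.index[0]  # elapsed time since the first sample
        corrected.index = (df.index[0] + offset) + elapsed * factor
        return corrected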

Installation
------------

Install the package from PyPI:

.. code:: bash

    pip install jointly

Usage
-----

The data has to be provided in pandas ``DataFrame`` instances with a
``DateTimeIndex`` for each sensor. In the following example, ``Faros`` and ``Physilog``
data are synchronized based on a shared reference column (here, the acceleration
magnitude); every other column in the ``DataFrame`` will be synchronized together
with that column.

.. code:: python

    import pandas as pd
    import tempfile
    import traceback

    import jointly

    # load source dataframes with datetime index
    faros_df = pd.read_csv(
        "./test-data/faros-plus-physilog/faros.csv.gz",
        index_col=[0],
        parse_dates=True,
    )
    physilog_df = pd.read_csv(
        "./test-data/faros-plus-physilog/physilog.csv.gz",
        index_col=[0],
        parse_dates=True,
    )

    # the magnitude is a common property that keeps shake information without axis relevance
    faros_df["Accel Mag"] = jointly.calculate_magnitude(
        faros_df, ["Accel X", "Accel Y", "Accel Z"]
    )
    physilog_df["Accel Mag"] = jointly.calculate_magnitude(
        physilog_df, ["Accel X", "Accel Y", "Accel Z"]
    )

    # create dictionary of source sensors
    sources = {
        "Faros": {
            "data": faros_df,
            "ref_column": "Accel Mag",
        },
        "Physilog": {
            "data": physilog_df,
            "ref_column": "Accel Mag",
        },
    }

    # set shake extraction parameters
    extractor = jointly.ShakeExtractor()
    extractor.start_window_length = pd.Timedelta(seconds=15)
    extractor.end_window_length = pd.Timedelta(seconds=10)
    extractor.min_length = 3
    extractor.threshold = 0.55

    # prepare the synchronizer
    synchronizer = jointly.Synchronizer(
        sources, reference_source_name="Faros", extractor=extractor
    )

    # if the extractor parameters are wrong, print the problem and show the data
    try:
        # get_synced_data returns a dictionary of sensor names to synced DataFrames
        synchronizer.get_synced_data()
    except Exception:
        traceback.print_exc()
        jointly.plot_reference_columns(sources)

    # save a file for each input sensor
    with tempfile.TemporaryDirectory() as tmp_dir:
        synchronizer.save_pickles(tmp_dir)

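If you just need the synchronized frames in memory rather than as pickle files,
``get_synced_data`` returns them in a dictionary keyed by source name. A short,
illustrative continuation of the example above (assuming synchronization succeeded):

.. code:: python

    # the keys match the names used in the `sources` dictionary
    synced = synchronizer.get_synced_data()
    faros_synced = synced["Faros"]
    print(faros_synced.head())
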
Template Credits
----------------
5 changes: 3 additions & 2 deletions docs/installation.rst

Or download the `tarball`_:

.. code-block:: console

    $ curl -OL https://github.com/hpi-dhc/jointly/tarball/master

Once you have a copy of the source, you can install it with `poetry`_:

.. code-block:: console

    $ poetry install

.. _Github repo: https://github.com/hpi-dhc/jointly
.. _tarball: https://github.com/hpi-dhc/jointly/tarball/master
.. _poetry: https://python-poetry.org/docs/#installation
165 changes: 111 additions & 54 deletions docs/usage.rst

Usage
==========


Preparing Data for Ingestion
----------------------------

The data has to be provided in a pandas ``DataFrame`` with a
``DateTimeIndex``. The following example shows how such a dataframe
should look:

.. code:: python

    import pandas as pd

    faros_df = pd.read_csv(
        "./test-data/faros-plus-physilog/faros.csv.gz",
        index_col=[0],
        parse_dates=True,
    )
    print(faros_df.head())

The output of ``faros_df.head()`` shows that the index is a ``DateTimeIndex``.
The ``NaN`` values due to the different sampling frequencies are ignored during synchronization.

::

                                Accel X  Accel Y  Accel Z    ECG
    1970-01-01 00:00:01.000       -88.0    771.0   -531.5  -21.0
    1970-01-01 00:00:01.008         NaN      NaN      NaN  -10.0
    1970-01-01 00:00:01.010       -86.0    779.0   -539.5    NaN
    1970-01-01 00:00:01.016         NaN      NaN      NaN   -2.0
    1970-01-01 00:00:01.020       -82.5    781.0   -543.0    NaN
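
A quick sanity check before handing a frame to jointly (a hypothetical helper,
not part of the library) could look like this:

.. code:: python

    import pandas as pd

    def check_source(df: pd.DataFrame) -> None:
        # jointly expects a DateTimeIndex; NaNs from differing sampling rates are fine
        assert isinstance(df.index, pd.DatetimeIndex), "index must be a DatetimeIndex"
        assert df.index.is_monotonic_increasing, "index must be sorted by time"

    check_source(faros_df)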

Each signal source, i.e., each sensor,
is given in a dictionary together with the name of the column
containing the events that should be synchronized, e.g., the
shakes common to all sensor signals, visible in the acceleration magnitude.
The name of that column and its sampling frequency can be different for
each sensor.

Finally, given the source dictionary, the synchronizer instance
can be created.

.. code:: python

    import jointly

    # physilog_df is loaded the same way as faros_df above
    sources = {
        "Faros": {
            "data": faros_df,
            "ref_column": "Accel Mag",
        },
        "Physilog": {
            "data": physilog_df,
            "ref_column": "Accel Mag",
        },
        # Any number of sensors can be added
        # 'Everion': {
        #     'data': everion_dataframe,
        #     'ref_column': 'ACCELERATION_MAGNITUDE',
        # }
    }

    jointly.Synchronizer(sources, reference_source_name="Faros")

Tuning Shake Detection
----------------------

If the shake detection doesn't find all shakes on the first try,
the following parameters will help:

.. code:: python

    import pandas as pd
    import jointly

    extractor = jointly.ShakeExtractor()

    # The start window should be long enough to contain
    # only the start shake in every data stream
    extractor.start_window_length = pd.Timedelta(seconds=15)

    # The end window (measured from the end of data)
    # should be exactly long enough to contain
    # only the end shake in every data stream
    extractor.end_window_length = pd.Timedelta(seconds=3)

    # Set to at most the number of shakes you did
    extractor.min_length = 3

    # Shakes are only accepted if they are higher than the
    # threshold (with all data normalized)
    extractor.threshold = 0.5

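Once tuned, the extractor is passed to the ``Synchronizer`` via its ``extractor``
argument, as in the README example:

.. code:: python

    synchronizer = jointly.Synchronizer(
        sources, reference_source_name="Faros", extractor=extractor
    )
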
Debugging
---------

To find issues with the shake detection, it often helps to plot the data.
``plot_reference_columns`` is available to plot the reference columns from
a source table.

Problems during synchronization throw exceptions, such as a ``BadWindowException``:

::

    jointly.synchronization_errors.BadWindowException:
    Start (0 days 00:10:00) or end (0 days 00:10:00) window lengths greater than
    length of signal Faros (0 days 00:00:36.992000). Make it so each window only
    covers start or end, not both.

Thus, the following code catches the problem and prints/shows helpful information:

.. code:: python

    import traceback

    # if the extractor parameters are wrong, print the problem and show the data
    try:
        # get_synced_data returns a dictionary of sensor names to synced DataFrames
        synchronizer.get_synced_data()
    except Exception:
        traceback.print_exc()
        jointly.plot_reference_columns(sources)

Saving data
-----------

``save_data`` can be given a table specification, i.e., a mapping from output
file name to the columns to include from each source,
to create an export file for each data category, while ``save_pickles``
dumps the synchronized dataframes for each individual sensor into a ``.pickle``
each.

To run the following examples, you should already have a ``Synchronizer`` instance
called ``synchronizer`` with an extractor configured such that no exceptions are thrown.
Check the readme file for an example.

``save_pickles()``
~~~~~~~~~~~~~~~~~~~~~~~
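
As a minimal sketch (mirroring the README example; the output directory is
illustrative), ``save_pickles`` writes one ``.pickle`` file per source into the
given directory:

.. code:: python

    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        synchronizer.save_pickles(tmp_dir)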

``save_data()``
~~~~~~~~~~~~~~~

``save_data`` takes a ``tables`` argument in which each
key at the root level defines the name of a corresponding file.
In each entry, select the source columns by creating a key (for
example, add ``Faros`` to select data from the ``Faros`` source)
that points to the columns to be extracted from that source, e.g.,
``['Accel X', 'Accel Y', 'Accel Z']``.

.. code:: python

    import tempfile
    import traceback

    # define output format for two files, one containing all acceleration
    # data, the other the ECG data
    tables = {
        'ACC': {
            'Faros': ['Accel X', 'Accel Y', 'Accel Z'],
            'Physilog': ['Accel X', 'Accel Y', 'Accel Z'],
        },
        'ECG': {
            'Faros': ['ECG'],
        },
    }

    # if the extractor parameters are wrong, print the problem and show the data
    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            synchronizer.save_data(tmp_dir, tables=tables, save_total_table=False)
    except Exception:
        traceback.print_exc()
        jointly.plot_reference_columns(sources)

In the resulting CSV file, each source/column combination gets its own column,
e.g. ``Faros_Accel X`` or ``Physilog_Accel Z``:

::

                                     Faros_Accel X  Faros_Accel Y  Faros_Accel Z  Physilog_Accel X  Physilog_Accel Y  Physilog_Accel Z
    1970-01-01 00:00:01.000000000              -88            771         -531.5
    1970-01-01 00:00:01.010000000              -86            779         -539.5
    1970-01-01 00:00:01.020000000            -82.5            781           -543
    1970-01-01 00:00:01.020907696                                                          -0.80457           0.02234           0.61023
    1970-01-01 00:00:01.030000000              -98            787         -521.5
    1970-01-01 00:00:01.040000000            -80.5            777           -557
    1970-01-01 00:00:01.050000000              -94          761.5         -539.5
    1970-01-01 00:00:01.052150462                                                          -0.81104           0.01721           0.59253
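
To inspect an exported table, read it back with pandas. The path below is an
assumption (one CSV per key of the ``tables`` dictionary in the chosen output
directory); adjust it to whatever ``save_data`` actually produced:

.. code:: python

    import pandas as pd

    # hypothetical path; one file per key of the `tables` dictionary
    acc = pd.read_csv("./synced/ACC.csv", index_col=0, parse_dates=True)
    print(acc.filter(like="Faros_").head())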



Logging
-------
To activate logging, simply add the following lines to your code:

.. code:: python

    import logging

    from jointly.log import logger

    logger.setLevel(logging.DEBUG)

This will give you insight into the shake detection and the calculation of the
time shifts and stretching factor, and will output plots of the segments.
2 changes: 2 additions & 0 deletions jointly/__init__.py
from .abstract_extractor import *
from .shake_extractor import *
from .synchronizer import *
from .helpers import *
from .helpers_plotting import *
Binary file added test-data/faros-plus-physilog/faros.csv.gz
Binary file added test-data/faros-plus-physilog/physilog.csv.gz
