diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index a8067b8a..a9dea69f 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -15,6 +15,9 @@ If you want to open an issue, make sure to follow these steps: Nice to see that you are interested in contributing to our code! Here is a general workflow for contributing to our project. +>[!NOTE] +>We are a privacy-focused project which thrives on trust and accountability. Therefore, we __do not allow any GenAI__ in this repository. + 1. Add an [issue](https://github.com/sodascience/metasyn/issues) describing the feature you want to add, or comment on an existing issue that you want to try solving it. The goal of this is to prevent you from spending time working on things that will not be accepted into our project. In general, we like to have some discussion on our issues before we start programming. 2. Fork the project and create a new branch on your fork (e.g., `add-cool-feature`) 3. Code the code πŸ’» diff --git a/.github/workflows/core-ci.yml b/.github/workflows/core-ci.yml index 7a329c16..1f859c3d 100644 --- a/.github/workflows/core-ci.yml +++ b/.github/workflows/core-ci.yml @@ -76,7 +76,7 @@ jobs: run: | metasyn schema -o current_schema.json MD5_DATA=(`md5sum current_schema.json`) - [[ ${MD5_DATA[0]} == "8e9351c9b1513a6586ad94fa8b82d94e" ]] + [[ ${MD5_DATA[0]} == "95a44bdef1e929e718a2c9647795b147" ]] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 2b9c3d01..083d3551 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,7 +7,7 @@ version: 2 # Set the version of Python and other tools you might need build: - os: ubuntu-20.04 + os: ubuntu-24.04 tools: python: "3.12" # @@ -21,4 +21,4 @@ python: - method: pip path: . 
extra_requirements: - - docs \ No newline at end of file + - docs diff --git a/docs/source/_templates/autosummary/module.rst b/docs/source/_templates/autosummary/module.rst new file mode 100644 index 00000000..3d15f255 --- /dev/null +++ b/docs/source/_templates/autosummary/module.rst @@ -0,0 +1,72 @@ +{{ fullname | escape | underline}} + +.. automodule:: {{ fullname }} + +{% if modules %} +.. rubric:: Submodules + +.. autosummary:: + :toctree: +{% for item in modules %} + {{ item }} +{%- endfor %} +{% endif %} + +{% if attributes %} +.. rubric:: Module Attributes + +.. autosummary:: +{% for item in attributes %} + {{ item }} +{%- endfor %} + +{% for item in attributes %} +.. autodata:: {{ item }} + :no-index: +{% endfor %} +{% endif %} + +{% if functions %} +.. rubric:: Functions + +.. autosummary:: +{% for item in functions %} + {{ item }} +{%- endfor %} + +{% for item in functions %} +.. autofunction:: {{ item }} + :no-index: +{% endfor %} +{% endif %} + +{% if classes %} +.. rubric:: Classes + +.. autosummary:: +{% for item in classes %} + {{ item }} +{%- endfor %} + +{% for item in classes %} +.. autoclass:: {{ item }} + :members: + :show-inheritance: + :no-index: + +{% endfor %} +{% endif %} + +{% if exceptions %} +.. rubric:: Exceptions + +.. autosummary:: +{% for item in exceptions %} + {{ item }} +{%- endfor %} + +{% for item in exceptions %} +.. autoexception:: {{ item }} + :no-index: +{% endfor %} +{% endif %} diff --git a/docs/source/api/demo.rst b/docs/source/api/demo.rst deleted file mode 100644 index 6d74983e..00000000 --- a/docs/source/api/demo.rst +++ /dev/null @@ -1,10 +0,0 @@ -Demonstration datasets -====================== - -.. 
automodule:: metasyn.demo - :members: - :undoc-members: - :imported-members: - :inherited-members: - :private-members: - :show-inheritance: diff --git a/docs/source/api/developer_reference.rst b/docs/source/api/developer_reference.rst index 4d606505..c03732ec 100644 --- a/docs/source/api/developer_reference.rst +++ b/docs/source/api/developer_reference.rst @@ -1,86 +1,57 @@ Developer reference =================== -This section is intended for those that want to contribute to the metasyn package, or simply want a deeper understanding of how it works. It contains the classes, functions and modules that are not in the rest of the reference API. These are mostly elements that are not directly used by users, but are important for developers of the metasyn package to understand the architecture. - -DistSpec -^^^^^^^^ - -.. autoclass:: metasyn.varspec.DistributionSpec - :members: - :undoc-members: - :show-inheritance: - -metasyn.config module -^^^^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.config - :members: - :undoc-members: - :show-inheritance: - - -metasyn.registry module -^^^^^^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.registry - :members: - :undoc-members: - :show-inheritance: - -metasyn.var module -^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.var - :members: - :undoc-members: - :show-inheritance: - -metasyn.testutils module -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.testutils - :members: - :undoc-members: - :show-inheritance: - -metasyn.validation module -^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.validation - :members: - :undoc-members: - :show-inheritance: - -metasyn.privacy module -^^^^^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.privacy - :members: - :undoc-members: - :show-inheritance: - -metasyn.util module -^^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.util - :members: - :undoc-members: - :show-inheritance: - -metasyn.file module -^^^^^^^^^^^^^^^^^^^ -.. 
automodule:: metasyn.file - :members: - :private-members: - :undoc-members: - :show-inheritance: - -metasyn.distribution.base module -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.distribution.base - :members: - :show-inheritance: - :private-members: - +This section is intended for those that want to contribute to the metasyn package, or simply want a deeper understanding of how it works. It contains all the classes, functions and modules in the metasyn package. Most elements are not directly used by users, but are important for developers of the metasyn package to understand the architecture. + +.. currentmodule:: metasyn + +Modules +------- + +.. autosummary:: + :toctree: generated + :recursive: + + config + demo + distribution + file + metaframe + privacy + registry + schema + testutils + util + validation + var + varspec + +Top-level classes +----------------- + +.. autosummary:: + :toctree: generated + + MetaFrame + MetaVar + VarSpec + +Top-level functions +------------------- + +.. 
autosummary:: + :toctree: generated + + demo_dataframe + demo_file + metadist + read_csv + read_dta + read_excel + read_sav + read_tsv + write_csv + write_dta + write_excel + write_sav + write_tsv diff --git a/docs/source/api/metaframe.rst b/docs/source/api/metaframe.rst index ab73bac0..a8ec8305 100644 --- a/docs/source/api/metaframe.rst +++ b/docs/source/api/metaframe.rst @@ -5,3 +5,4 @@ MetaFrame :members: :undoc-members: :show-inheritance: + :no-index: diff --git a/docs/source/api/metasyn.demo.rst b/docs/source/api/metasyn.demo.rst index 6d74983e..7ad69013 100644 --- a/docs/source/api/metasyn.demo.rst +++ b/docs/source/api/metasyn.demo.rst @@ -8,3 +8,4 @@ Demonstration datasets :inherited-members: :private-members: :show-inheritance: + :no-index: \ No newline at end of file diff --git a/docs/source/api/metasyn.rst b/docs/source/api/metasyn.rst index 62b96591..b71e92da 100644 --- a/docs/source/api/metasyn.rst +++ b/docs/source/api/metasyn.rst @@ -10,5 +10,6 @@ This section aims to give an overview of all classes, functions and methods in t metasyn.demo metaframe varspec + multiframe developer_reference diff --git a/docs/source/api/multiframe.rst b/docs/source/api/multiframe.rst new file mode 100644 index 00000000..cbd38e28 --- /dev/null +++ b/docs/source/api/multiframe.rst @@ -0,0 +1,7 @@ +Multiframe +========== + +.. automodule:: metasyn.multiframe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/varspec.rst b/docs/source/api/varspec.rst index 170ee58c..fdd33f54 100644 --- a/docs/source/api/varspec.rst +++ b/docs/source/api/varspec.rst @@ -6,4 +6,5 @@ VarSpec :members: :undoc-members: :show-inheritance: + :no-index: diff --git a/docs/source/cli.rst b/docs/source/cli.rst index ab69b9a9..78775b13 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -73,13 +73,13 @@ Here's how you can use Docker to access Metasyn's CLI: .. 
note:: - You can also specify which ``metasyn`` version to use in docker, by adding a tag to the docker image. For example, to use version 1.1.0, you can use the following command: + You can also specify which ``metasyn`` version to use in docker, by adding a tag to the docker image. For example, to use version 2.1.0, you can use the following command: .. tab:: Installing a specific version .. code-block:: console - docker pull sodateam/metasyn:v1.1.0 + docker pull sodateam/metasyn:v2.1.0 .. tab:: Using a command on a specific version @@ -87,13 +87,13 @@ Here's how you can use Docker to access Metasyn's CLI: .. code-block:: console - docker run -v %cd%:/wd sodateam/metasyn:v1.1.0 --help + docker run -v %cd%:/wd sodateam/metasyn:v2.1.0 --help .. tab:: Unix or MacOS: .. code-block:: console - docker run -v $(pwd):/wd sodateam/metasyn:v1.1.0 --help + docker run -v $(pwd):/wd sodateam/metasyn:v2.1.0 --help Creating generative metadata diff --git a/docs/source/conf.py b/docs/source/conf.py index 090f58cc..6644560e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,7 +25,7 @@ # The full version, including alpha/beta/rc tags -release = '2.0.0' +release = '2.1.0' # -- General configuration --------------------------------------------------- @@ -66,7 +66,7 @@ html_theme_options = { "logo_only": True, "navigation_depth": -1, - "display_version": True, + "version_selector": True, "style_external_links": False, } diff --git a/docs/source/index.rst b/docs/source/index.rst index 9e233738..a177d5a0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -47,6 +47,7 @@ for data owners to share the structure and approximation of the content of their improve_synth datafree + multiframe .. toctree:: :hidden: diff --git a/docs/source/multiframe.rst b/docs/source/multiframe.rst new file mode 100644 index 00000000..fcbfcbab --- /dev/null +++ b/docs/source/multiframe.rst @@ -0,0 +1,122 @@ +Multiple table generation +========================= + +.. 
currentmodule:: metasyn.multiframe + +From version 2.1 onwards, metasyn implements primary key/foreign key relations. +You might have multiple tables where one column in one table references another +column in another table. It might be important for the utility of the synthetic data that +this relationship is also present in the synthetic data. + +Consider for example the very simple set of two tables with passengers and their medical data. + +.. tab:: passengers.csv + + +-------------+---------------------------------------------------------+----------+-----+-------+--------------------+---------+--------+----------+------------+------------+---------------------+---------+ + | PassengerId | Name | Sex | Age | Parch | Ticket | Fare | Cabin | Embarked | Birthday | Board time | Married since | all\_NA | + +=============+=========================================================+==========+=====+=======+====================+=========+========+==========+============+============+=====================+=========+ + | 1 | "Braund, Mr. Owen Harris" | "male" | 22 | 0 | "A/5 21171" | 7.25 | null | "S" | 1937-10-28 | 15:53:04 | 2022-08-05 04:43:34 | null | + +-------------+---------------------------------------------------------+----------+-----+-------+--------------------+---------+--------+----------+------------+------------+---------------------+---------+ + | 2 | "Cumings, Mrs. John Bradley \(Florence Briggs Thayer\)" | "female" | 38 | 0 | "PC 17599" | 71.2833 | "C85" | "C" | null | 12:26:00 | 2022-08-07 01:56:33 | null | + +-------------+---------------------------------------------------------+----------+-----+-------+--------------------+---------+--------+----------+------------+------------+---------------------+---------+ + | 3 | "Heikkinen, Miss. Laina" | "female" | 26 | 0 | "STON/O2. 
3101282" | 7.925 | null | "S" | 1931-09-24 | 16:08:25 | 2022-08-04 20:27:37 | null | + +-------------+---------------------------------------------------------+----------+-----+-------+--------------------+---------+--------+----------+------------+------------+---------------------+---------+ + | 4 | "Futrelle, Mrs. Jacques Heath \(Lily May Peel\)" | "female" | 35 | 0 | "113803" | 53.1 | "C123" | "S" | 1936-11-30 | null | 2022-08-07 07:05:55 | null | + +-------------+---------------------------------------------------------+----------+-----+-------+--------------------+---------+--------+----------+------------+------------+---------------------+---------+ + | 5 | "Allen, Mr. William Henry" | "male" | 35 | 0 | "373450" | 8.05 | null | "S" | 1918-11-07 | 10:59:08 | 2022-08-02 15:13:34 | null | + +-------------+---------------------------------------------------------+----------+-----+-------+--------------------+---------+--------+----------+------------+------------+---------------------+---------+ + +.. tab:: medical_data.csv + + +-------------+---------------------------------------------------------+ + | PassengerId | Medical condition | + +=============+=========================================================+ + | 3 | "Healthy" | + +-------------+---------------------------------------------------------+ + | 1 | "Fever" | + +-------------+---------------------------------------------------------+ + | 4 | "Unknown" | + +-------------+---------------------------------------------------------+ + +In our analysis we might want to combine the two tables; for example we might want to analyze relate the medical condition +and the age of the passenger. For this you will need to join the tables. This might pose a problem if you generate +the synthetic tables independently of each other. The synthetic ``medical_data.csv`` version might generate ``PassengerId`` +values that do not occur in the ``passengers.csv`` table, while their original versions do. 
+ +Metasyn provides the :class:`metasyn.multiframe.MultiFrame` class to synthesize multiple tables at once and define +relations between columns across tables. + +Column relations +---------------- + +Metasyn implements a few different kinds of relations between columns: ``subset`` (``SUBSET OF``), ``equal`` (``EQUALS``) and +``equal_ordered`` (``EQUAL ORDERED``). There is one extra relation ``infer`` (``INFER FROM``), which signals to metasyn to attempt to +infer the relation automatically. There are two ways to define a relation between two columns: one using a string, the other +using the :class:`metasyn.multiframe.ColumnRelation` class: + +.. tab:: string + +   .. code-block:: python + +      from metasyn.multiframe import ColumnRelation + +      relation_str = "medical_data.csv[PassengerId] SUBSET OF passengers.csv[PassengerId]" +      relation = ColumnRelation.parse(relation_str) + +.. tab:: direct + +   .. code-block:: python + +      from metasyn.multiframe import ColumnRelation, RelationType + +      relation = ColumnRelation(primary_table="passengers.csv", primary_key="PassengerId", +                                foreign_table="medical_data.csv", foreign_key="PassengerId", +                                relation_type=RelationType.Subset) + +Multiframe +---------- + +The :class:`metasyn.multiframe.MultiFrame` class is the equivalent of the :class:`metasyn.metaframe.MetaFrame` for +multiple tables. The class can be created directly using initialized metaframes or you can use the :meth:`metasyn.multiframe.MultiFrame.fit_dataframes` method. + + +.. tab:: fit dataframes + +   .. code-block:: python + +      from metasyn.multiframe import MultiFrame + +      dfs = {"a": pl.read_csv(...), "b": pl.read_csv(...)} + +      relations = ["b[passengerId] SUBSET OF a[ID]", "a[userId] <= b[userId]"] +      multi_frame = MultiFrame.fit_dataframes(dfs, relations=relations, extra_kwargs={"a": {...}, "b": {...}}) + +.. tab:: direct initialization + + ..
code-block:: python + + from metasyn.multiframe import MultiFrame + + dfs = {"a": pl.read_csv(...), "b": pl.read_csv(...)} + mfs = {"a": MetaFrame.fit_dataframe(dfs["a"], ...), "b": MetaFrame.fit_dataframe(dfs["b"], ...)} + + + relations = ["b[passengerId] SUBSET OF a[ID]", "b[userId] EQUALS a[userId]"] + multi_frame = MultiFrame(mfs, relations=relations, dataframes=dfs) + + +Synthesizing multiple tables +---------------------------- + +Similar to the :class:`metasyn.metaframe.MetaFrame` class, the :class:`MultiFrame` class has a :meth:`MultiFrame.synthesize` method to generate +synthetic dataframes. Tables can have a different number of rows and can be set during generation of the synthetic dataset. + +.. code-block:: python + + multiframe.synthesize(n={"a": 100, "b": 200}) + +.. note:: + + In contrast to single table synthesis, you might not be able to independently set the number of rows for each + table. For example, if one table has an "equal" relationship with another table, the two tables should have the + same size. \ No newline at end of file diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst index 7ff3bbb4..a9236cb7 100644 --- a/docs/source/tutorials.rst +++ b/docs/source/tutorials.rst @@ -8,3 +8,6 @@ to help you get started. You can access it here: - Google Colab: `Open in Google Colab `__ - GitHub: `View on GitHub `__ + +Multiple table tutorial on `Google Colab `__ or +`GitHub `__. \ No newline at end of file diff --git a/docs/source/what_is_metasyn.rst b/docs/source/what_is_metasyn.rst index 42d59684..fdb09d18 100644 --- a/docs/source/what_is_metasyn.rst +++ b/docs/source/what_is_metasyn.rst @@ -31,6 +31,7 @@ Key features - **Integration with Faker**: Metasyn integrates with the `faker `__ package, a Python library for generating fake data such as names and emails. Allowing for more realistic synthetic data. 
- **Structured String Detection**: Metasyn identifies structured strings within your dataset, which can include formatted text, codes, identifiers, or any string that follows a specific pattern. - **Handling Unique Values**: Metasyn can identify and process variables with unique values or keys in the data, preserving their uniqueness in the synthetic dataset. +- **Multiple table support**: Metasyn supports the synthesis of multiple tables at the same time. Primary/foreign key relations can be preserved so that the synthetic tables can be joined correctly. For more detail on how metasyn works, see our `paper `_. diff --git a/examples/basic_example.py b/examples/basic_example.py index f1b9bc6b..e85cd5c1 100644 --- a/examples/basic_example.py +++ b/examples/basic_example.py @@ -1,10 +1,10 @@ from pathlib import Path -from metasyn import MetaFrame, demo_dataframe +from metasyn import MetaFrame, demo_data from metasyn.config import VarSpec # example dataframe from polars website -df = demo_dataframe("fruit") +df = demo_data("fruit") # set A to unique and B to not unique specs = [ diff --git a/examples/config_files/example_all.toml b/examples/config_files/example_all.toml index 8df39ed6..1e8dfc41 100644 --- a/examples/config_files/example_all.toml +++ b/examples/config_files/example_all.toml @@ -283,10 +283,10 @@ name = "core.truncated_normal" unique = false [var.distribution.parameters] -lower = -1 -upper = 2 +lower = 2 +upper = 20 mean = 0 -sd = 1 +sd = 50 [[var]] @@ -313,8 +313,8 @@ name = "core.normal" unique = false [var.distribution.parameters] -mean = 0 -sd = 1 +mean = 2 +sd = 20 [[var]] diff --git a/examples/getting_started.ipynb b/examples/getting_started.ipynb index 64669ba5..90886fe1 100644 --- a/examples/getting_started.ipynb +++ b/examples/getting_started.ipynb @@ -18,7 +18,10 @@ "cell_type": "markdown", "id": "f5c6597b", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Step 0: Install 
the metasyn package and import required packages" @@ -28,7 +31,10 @@ "cell_type": "markdown", "id": "ce44185723aaacd", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "First, let's install the metasyn package." @@ -49,7 +55,10 @@ "cell_type": "markdown", "id": "58abb192e714247c", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "Now, let's import the required packages." @@ -166,13 +175,13 @@ "text/plain": [ "Schema([('PassengerId', Int64),\n", " ('Name', String),\n", - " ('Sex', Categorical(ordering='physical')),\n", + " ('Sex', Categorical),\n", " ('Age', Int64),\n", " ('Parch', Int64),\n", " ('Ticket', String),\n", " ('Fare', Float64),\n", " ('Cabin', String),\n", - " ('Embarked', Categorical(ordering='physical')),\n", + " ('Embarked', Categorical),\n", " ('Birthday', Date),\n", " ('Board time', Time),\n", " ('Married since', Datetime(time_unit='us', time_zone=None)),\n", @@ -201,7 +210,10 @@ "execution_count": 5, "id": "c72c2acb55fca193", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -214,7 +226,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 14)
statisticPassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
strf64strstrf64f64strf64strstrstrstrstrstr
"count"891.0"891""891"714.0891.0"891"891.0"204""889""813""812""799""0"
"null_count"0.0"0""0"177.00.0"0"0.0"687""2""78""79""92""891"
"mean"446.0nullnull29.6932770.381594null32.204208nullnull"1921-07-27 22:08:24.798000""14:38:10.014778""2022-07-31 03:43:48.767209"null
"std"257.353842nullnull14.5245270.806057null49.693429nullnullnullnullnullnull
"min"1.0"Abbing, Mr. Anthony"null0.00.0"110152"0.0"A10"null"1903-07-28""10:39:40""2022-07-15 12:21:15"null
"25%"224.0nullnull20.00.0null7.925nullnull"1911-09-18""12:39:02""2022-07-23 11:16:56"null
"50%"446.0nullnull28.00.0null14.4542nullnull"1922-03-26""14:29:34""2022-07-31 00:36:56"null
"75%"669.0nullnull38.00.0null31.0nullnull"1930-08-29""16:40:12""2022-08-08 03:35:52"null
"max"891.0"van Melkebeke, Mr. Philemon"null80.06.0"WE/P 5735"512.3292"T"null"1940-05-27""18:39:28""2022-08-15 10:32:05"null
" + "shape: (9, 14)
statisticPassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
strf64strstrf64f64strf64strstrstrstrstrstr
"count"891.0"891""891"714.0891.0"891"891.0"204""889""813""812""799""0"
"null_count"0.0"0""0"177.00.0"0"0.0"687""2""78""79""92""891"
"mean"446.0nullnull29.6932770.381594null32.204208nullnull"1921-07-27 22:08:24.797048""14:38:10.014778""2022-07-31 03:43:48.767209"null
"std"257.353842nullnull14.5245270.806057null49.693429nullnullnullnullnullnull
"min"1.0"Abbing, Mr. Anthony"null0.00.0"110152"0.0"A10"null"1903-07-28""10:39:40""2022-07-15 12:21:15"null
"25%"224.0nullnull20.00.0null7.925nullnull"1911-09-18""12:39:02""2022-07-23 11:16:56"null
"50%"446.0nullnull28.00.0null14.4542nullnull"1922-03-26""14:29:34""2022-07-31 00:36:56"null
"75%"669.0nullnull38.00.0null31.0nullnull"1930-08-29""16:40:12""2022-08-08 03:35:52"null
"max"891.0"van Melkebeke, Mr. Philemon"null80.06.0"WE/P 5735"512.3292"T"null"1940-05-27""18:39:28""2022-08-15 10:32:05"null
" ], "text/plain": [ "shape: (9, 14)\n", @@ -228,7 +240,7 @@ "β”‚ null_count ┆ 0.0 ┆ 0 ┆ 0 ┆ … ┆ 78 ┆ 79 ┆ 92 ┆ 891 β”‚\n", "β”‚ mean ┆ 446.0 ┆ null ┆ null ┆ … ┆ 1921-07-27 ┆ 14:38:10.0 ┆ 2022-07-31 ┆ null β”‚\n", "β”‚ ┆ ┆ ┆ ┆ ┆ 22:08:24.7 ┆ 14778 ┆ 03:43:48.7 ┆ β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ 98000 ┆ ┆ 67209 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ 97048 ┆ ┆ 67209 ┆ β”‚\n", "β”‚ std ┆ 257.353842 ┆ null ┆ null ┆ … ┆ null ┆ null ┆ null ┆ null β”‚\n", "β”‚ min ┆ 1.0 ┆ Abbing, ┆ null ┆ … ┆ 1903-07-28 ┆ 10:39:40 ┆ 2022-07-15 ┆ null β”‚\n", "β”‚ ┆ ┆ Mr. ┆ ┆ ┆ ┆ ┆ 12:21:15 ┆ β”‚\n", @@ -275,7 +287,10 @@ "cell_type": "markdown", "id": "98b5d2759a1381f1", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "Generating a MetaFrame is simple, and can be done by simply calling the `MetaFrame.fit_dataframe()` class method, passing in the DataFrame as a parameter." @@ -291,7 +306,7 @@ "name": "stderr", "output_type": "stream", "text": [ - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:03<00:00, 3.89variables/s]\n" + " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:03<00:00, 3.57variables/s]\n" ] } ], @@ -346,7 +361,7 @@ "\n", "Column 3: \"Sex\"\n", "- Variable Type: categorical\n", - "- Data Type: Categorical(ordering='physical')\n", + "- Data Type: Categorical\n", "- Proportion of Missing Values: 0.0000\n", "- Distribution:\n", "\t- Type: core.multinoulli\n", @@ -377,8 +392,8 @@ "\t- Parameters:\n", "\t\t- lower: -1e-08\n", "\t\t- upper: 6.00000001\n", - "\t\t- mean: -380.6440825743838\n", - "\t\t- sd: 12.066012048277289\n", + "\t\t- mean: -425.30866115809334\n", + "\t\t- sd: 12.827869304621283\n", "\t\n", "\n", "Column 6: \"Ticket\"\n", @@ -413,7 +428,7 @@ "\n", "Column 9: \"Embarked\"\n", "- Variable Type: categorical\n", - "- Data Type: Categorical(ordering='physical')\n", + "- Data Type: Categorical\n", "- Proportion of Missing Values: 0.0022\n", "- 
Distribution:\n", "\t- Type: core.multinoulli\n", @@ -508,7 +523,10 @@ "cell_type": "markdown", "id": "a79120f6b907d352", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "The GMF file should now be saved to the specified filepath, feel free to open and inspect it!" @@ -518,7 +536,10 @@ "cell_type": "markdown", "id": "fea40d1407e42ea", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "It's also possible to preview how the saved file would look, without actually saving it to disk. This can be done as follows:" @@ -529,7 +550,10 @@ "execution_count": 9, "id": "2b72d8907ec04999", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -558,7 +582,10 @@ "execution_count": 10, "id": "c5eac7eeb3326f03", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -570,7 +597,10 @@ "cell_type": "markdown", "id": "6571873d", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Step 4: Generating synthetic data" @@ -580,7 +610,10 @@ "cell_type": "markdown", "id": "85201666a67a73fd", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "Once a MetaFrame is loaded, synthetic data can be generated from it. We can do so by using the the `synthesize` method of the MetaFrame, passing in how many rows the generated data should contain as a parameter. This returns a DataFrame with the synthetic data.\n", @@ -598,7 +631,27 @@ "name": "stderr", "output_type": "stream", "text": [ - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:00<00:00, 208.45variables/s]\n" + " PassengerId: 0%| | 0/13 [00:00\n", - "shape: (5, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
i64strcati64i64strf64strcatdatetimedatetime[ΞΌs]str
1"Mr. Tonight. Heart. Ball. Stan…"female"110"LU.A. 800508"15.125304null"Q"1936-01-1417:27:122022-08-12 19:54:15null
2"Condition. After.""female"530"44685"29.62215"E0""C"1904-01-13null2022-08-08 00:55:54null
3"Research.""male"10"0856"5.915536"B6""S"1910-03-0511:39:142022-07-24 13:45:17null
4"Remember. Speak protect.""male"460"2480"21.383541null"S"1921-09-2612:43:462022-08-08 16:23:10null
5"Think. Yes. Star.""male"null1"UMJ 1910"20.597204"D052""S"nullnull2022-08-06 15:52:49null
" + "shape: (5, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
i64strcati64i64strf64strcatdatetimedatetime[ΞΌs]str
1"Claim. Top.""female"140"NNKW 94614"86.603806"F 17""C"1913-10-0512:28:542022-08-02 07:41:26null
2"Audience. My.""male"null0"70616"11.019336null"S"1911-07-1813:14:062022-08-12 13:57:25null
3"Machine. Individual. Ago.""male"430"032202"6.283949null"S"1915-11-2613:44:472022-08-15 01:09:36null
4"Hundred. Force. Pretty. Focus.""male"400"5688"84.840104null"S"1919-10-0915:15:102022-08-06 04:52:12null
5"Blood. Bad. Bad.""male"320"182925"13.324082null"S"1918-05-10null2022-08-06 20:54:57null
" ], "text/plain": [ "shape: (5, 13)\n", @@ -645,18 +704,19 @@ "β”‚ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ --- ┆ str β”‚\n", "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[ΞΌs] ┆ β”‚\n", "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ══════════════β•ͺ════════β•ͺ══════β•ͺ═══β•ͺ════════════β•ͺ════════════β•ͺ══════════════β•ͺ════════║\n", - "β”‚ 1 ┆ Mr. Tonight. ┆ female ┆ 11 ┆ … ┆ 1936-01-14 ┆ 17:27:12 ┆ 2022-08-12 ┆ null β”‚\n", - "β”‚ ┆ Heart. Ball. ┆ ┆ ┆ ┆ ┆ ┆ 19:54:15 ┆ β”‚\n", - "β”‚ ┆ Stan… ┆ ┆ ┆ ┆ ┆ ┆ ┆ β”‚\n", - "β”‚ 2 ┆ Condition. ┆ female ┆ 53 ┆ … ┆ 1904-01-13 ┆ null ┆ 2022-08-08 ┆ null β”‚\n", - "β”‚ ┆ After. ┆ ┆ ┆ ┆ ┆ ┆ 00:55:54 ┆ β”‚\n", - "β”‚ 3 ┆ Research. ┆ male ┆ 1 ┆ … ┆ 1910-03-05 ┆ 11:39:14 ┆ 2022-07-24 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 13:45:17 ┆ β”‚\n", - "β”‚ 4 ┆ Remember. ┆ male ┆ 46 ┆ … ┆ 1921-09-26 ┆ 12:43:46 ┆ 2022-08-08 ┆ null β”‚\n", - "β”‚ ┆ Speak ┆ ┆ ┆ ┆ ┆ ┆ 16:23:10 ┆ β”‚\n", - "β”‚ ┆ protect. ┆ ┆ ┆ ┆ ┆ ┆ ┆ β”‚\n", - "β”‚ 5 ┆ Think. Yes. ┆ male ┆ null ┆ … ┆ null ┆ null ┆ 2022-08-06 ┆ null β”‚\n", - "β”‚ ┆ Star. ┆ ┆ ┆ ┆ ┆ ┆ 15:52:49 ┆ β”‚\n", + "β”‚ 1 ┆ Claim. Top. ┆ female ┆ 14 ┆ … ┆ 1913-10-05 ┆ 12:28:54 ┆ 2022-08-02 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 07:41:26 ┆ β”‚\n", + "β”‚ 2 ┆ Audience. ┆ male ┆ null ┆ … ┆ 1911-07-18 ┆ 13:14:06 ┆ 2022-08-12 ┆ null β”‚\n", + "β”‚ ┆ My. ┆ ┆ ┆ ┆ ┆ ┆ 13:57:25 ┆ β”‚\n", + "β”‚ 3 ┆ Machine. ┆ male ┆ 43 ┆ … ┆ 1915-11-26 ┆ 13:44:47 ┆ 2022-08-15 ┆ null β”‚\n", + "β”‚ ┆ Individual. ┆ ┆ ┆ ┆ ┆ ┆ 01:09:36 ┆ β”‚\n", + "β”‚ ┆ Ago. ┆ ┆ ┆ ┆ ┆ ┆ ┆ β”‚\n", + "β”‚ 4 ┆ Hundred. ┆ male ┆ 40 ┆ … ┆ 1919-10-09 ┆ 15:15:10 ┆ 2022-08-06 ┆ null β”‚\n", + "β”‚ ┆ Force. ┆ ┆ ┆ ┆ ┆ ┆ 04:52:12 ┆ β”‚\n", + "β”‚ ┆ Pretty. ┆ ┆ ┆ ┆ ┆ ┆ ┆ β”‚\n", + "β”‚ ┆ Focus. ┆ ┆ ┆ ┆ ┆ ┆ ┆ β”‚\n", + "β”‚ 5 ┆ Blood. Bad. ┆ male ┆ 32 ┆ … ┆ 1918-05-10 ┆ null ┆ 2022-08-06 ┆ null β”‚\n", + "β”‚ ┆ Bad. 
┆ ┆ ┆ ┆ ┆ ┆ 20:54:57 ┆ β”‚\n", "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" ] }, @@ -689,7 +749,10 @@ "cell_type": "markdown", "id": "7d77224c9e212c06", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "The `MetaFrame.fit_dataframe()` method allows you to have more control over how your synthetic dataset is generated by passing in an optional `spec` (short for specification) parameter. `spec` is a dictionary that can be used to give metasyn instructions on a per-variable basis, these instructions can range from setting a variable to be unique, to directly setting its distribution. " @@ -717,8 +780,28 @@ "name": "stderr", "output_type": "stream", "text": [ - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:01<00:00, 6.92variables/s]\n", - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:00<00:00, 211.65variables/s]\n" + " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:01<00:00, 6.87variables/s]\n", + " PassengerId: 0%| | 0/13 [00:00\n", - "shape: (5, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
i64strcati64i64strf64strcatdatetimedatetime[ΞΌs]str
1"Steven Johnson""male"null0"098407"22.583406"D7B59""S"1937-08-1418:23:242022-07-17 08:19:10null
2"Vanessa Day""male"110"6632"10.218858null"S"1933-12-3015:17:032022-08-15 08:34:23null
3"Kayla Hogan""male"100"DA.A. 53765"41.529791"A55""C"1920-06-2214:38:13nullnull
4"Denise Ramirez""male"null0"899355"35.176823null"S"1923-09-29null2022-08-11 03:40:24null
5"Mary Stevens""male"370"177074"13.195501null"S"1909-09-1718:14:102022-08-14 05:35:23null
" + "shape: (5, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
i64strcati64i64strf64strcatdatetimedatetime[ΞΌs]str
1"Terri Jackson""male"100"0556"11.055061null"S"1932-02-1914:44:282022-07-21 10:58:01null
2"Sarah Davenport""male"210"31324"20.714015null"S"1912-05-0215:52:172022-08-13 16:39:37null
3"Jennifer Johnson""female"361"WLH 447230"10.208813null"S"1910-11-1816:16:032022-08-10 22:00:46null
4"Pamela Crawford""male"81"35690"12.417835"A6""S"1909-05-1812:19:14nullnull
5"Morgan Perez""female"10"5064"6.986164"E219""S"1908-07-2217:20:192022-07-19 20:50:15null
" ], "text/plain": [ "shape: (5, 13)\n", - "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ PassengerId ┆ Name ┆ Sex ┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married since ┆ all_NA β”‚\n", - "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- β”‚\n", - "β”‚ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ datetime[ΞΌs] ┆ str β”‚\n", - "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════════β•ͺ══════β•ͺ══════β•ͺ═══β•ͺ════════════β•ͺ════════════β•ͺ═══════════════β•ͺ════════║\n", - "β”‚ 1 ┆ Steven ┆ male ┆ null ┆ … ┆ 1937-08-14 ┆ 18:23:24 ┆ 2022-07-17 ┆ null β”‚\n", - "β”‚ ┆ Johnson ┆ ┆ ┆ ┆ ┆ ┆ 08:19:10 ┆ β”‚\n", - "β”‚ 2 ┆ Vanessa Day ┆ male ┆ 11 ┆ … ┆ 1933-12-30 ┆ 15:17:03 ┆ 2022-08-15 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 08:34:23 ┆ β”‚\n", - "β”‚ 3 ┆ Kayla Hogan ┆ male ┆ 10 ┆ … ┆ 1920-06-22 ┆ 14:38:13 ┆ null ┆ null β”‚\n", - "β”‚ 4 ┆ Denise ┆ male ┆ null ┆ … ┆ 1923-09-29 ┆ null ┆ 2022-08-11 ┆ null β”‚\n", - "β”‚ ┆ Ramirez ┆ ┆ ┆ ┆ ┆ ┆ 03:40:24 ┆ β”‚\n", - "β”‚ 5 ┆ Mary Stevens ┆ male ┆ 37 ┆ … ┆ 1909-09-17 ┆ 18:14:10 ┆ 2022-08-14 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 05:35:23 ┆ β”‚\n", - "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" + "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ PassengerId ┆ Name ┆ Sex 
┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married ┆ all_NA β”‚\n", + "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ since ┆ --- β”‚\n", + "β”‚ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ --- ┆ str β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[ΞΌs] ┆ β”‚\n", + "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═══════════════β•ͺ════════β•ͺ═════β•ͺ═══β•ͺ════════════β•ͺ════════════β•ͺ══════════════β•ͺ════════║\n", + "β”‚ 1 ┆ Terri Jackson ┆ male ┆ 10 ┆ … ┆ 1932-02-19 ┆ 14:44:28 ┆ 2022-07-21 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 10:58:01 ┆ β”‚\n", + "β”‚ 2 ┆ Sarah ┆ male ┆ 21 ┆ … ┆ 1912-05-02 ┆ 15:52:17 ┆ 2022-08-13 ┆ null β”‚\n", + "β”‚ ┆ Davenport ┆ ┆ ┆ ┆ ┆ ┆ 16:39:37 ┆ β”‚\n", + "β”‚ 3 ┆ Jennifer ┆ female ┆ 36 ┆ … ┆ 1910-11-18 ┆ 16:16:03 ┆ 2022-08-10 ┆ null β”‚\n", + "β”‚ ┆ Johnson ┆ ┆ ┆ ┆ ┆ ┆ 22:00:46 ┆ β”‚\n", + "β”‚ 4 ┆ Pamela ┆ male ┆ 8 ┆ … ┆ 1909-05-18 ┆ 12:19:14 ┆ null ┆ null β”‚\n", + "β”‚ ┆ Crawford ┆ ┆ ┆ ┆ ┆ ┆ ┆ β”‚\n", + "β”‚ 5 ┆ Morgan Perez ┆ female ┆ 1 ┆ … ┆ 1908-07-22 ┆ 17:20:19 ┆ 2022-07-19 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 20:50:15 ┆ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" ] }, "execution_count": 13, @@ -781,7 +866,10 @@ "cell_type": "markdown", "id": "f1b8b5b434fe1f52", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "### Spec: Setting distributions manually\n", @@ -799,8 +887,28 @@ "name": "stderr", "output_type": "stream", "text": [ - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:01<00:00, 9.16variables/s]\n", - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:00<00:00, 303.66variables/s]\n" + " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:01<00:00, 
8.54variables/s]\n", + " PassengerId: 0%| | 0/13 [00:00\n", - "shape: (5, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
i64strcati64i64strf64strcatdatetimedatetime[ΞΌs]str
1"Michael Garcia""female"null1"651404"1.80778null"S"1917-09-2813:43:162022-08-13 21:46:53null
2"David Black""male"310"997658"0.884637null"S"1936-03-0411:11:402022-08-03 16:35:45null
3"Mary Rivera""female"200"5137"0.599972null"S"1905-03-1916:58:132022-07-24 01:41:11null
4"Richard Cole""male"380"7669"3.11026null"S"1929-05-0914:46:13nullnull
5"Nathaniel Nguyen""male"350"50802"0.457758null"S"1925-04-0917:02:152022-07-29 07:49:40null
" + "shape: (5, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
i64strcati64i64strf64strcatdatetimedatetime[ΞΌs]str
1"Eric Merritt""male"320"794550"0.555465"G0""S"1921-09-0918:17:142022-07-16 04:16:08null
2"Lisa Cole""male"390"958127"1.547403null"S"1910-11-1613:42:382022-07-26 00:33:24null
3"Tyler Mcguire""male"350"VJK.A. 0718"0.100114"B38""S"null12:01:472022-07-19 01:37:44null
4"Michael Craig""male"360"79724"0.79439null"S"1916-11-03null2022-07-26 11:18:06null
5"Kenneth Herman Jr.""male"340"48242"1.222847null"S"1913-06-2310:59:152022-08-01 00:48:58null
" ], "text/plain": [ "shape: (5, 13)\n", - "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”\n", - "β”‚ PassengerId ┆ Name ┆ Sex ┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married ┆ all_NA β”‚\n", - "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ since ┆ --- β”‚\n", - "β”‚ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ --- ┆ str β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[ΞΌs] ┆ β”‚\n", - "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ══════════════β•ͺ════════β•ͺ══════β•ͺ═══β•ͺ════════════β•ͺ════════════β•ͺ══════════════β•ͺ════════║\n", - "β”‚ 1 ┆ Michael ┆ female ┆ null ┆ … ┆ 1917-09-28 ┆ 13:43:16 ┆ 2022-08-13 ┆ null β”‚\n", - "β”‚ ┆ Garcia ┆ ┆ ┆ ┆ ┆ ┆ 21:46:53 ┆ β”‚\n", - "β”‚ 2 ┆ David Black ┆ male ┆ 31 ┆ … ┆ 1936-03-04 ┆ 11:11:40 ┆ 2022-08-03 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 16:35:45 ┆ β”‚\n", - "β”‚ 3 ┆ Mary Rivera ┆ female ┆ 20 ┆ … ┆ 1905-03-19 ┆ 16:58:13 ┆ 2022-07-24 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 01:41:11 ┆ β”‚\n", - "β”‚ 4 ┆ Richard Cole ┆ male ┆ 38 ┆ … ┆ 1929-05-09 ┆ 14:46:13 ┆ null ┆ null β”‚\n", - "β”‚ 5 ┆ Nathaniel ┆ male ┆ 35 ┆ … ┆ 1925-04-09 ┆ 17:02:15 ┆ 2022-07-29 ┆ null β”‚\n", - "β”‚ ┆ Nguyen ┆ ┆ ┆ ┆ ┆ ┆ 07:49:40 ┆ β”‚\n", - "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" + 
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ PassengerId ┆ Name ┆ Sex ┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married since ┆ all_NA β”‚\n", + "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- β”‚\n", + "β”‚ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ datetime[ΞΌs] ┆ str β”‚\n", + "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════════β•ͺ══════β•ͺ═════β•ͺ═══β•ͺ════════════β•ͺ════════════β•ͺ═══════════════β•ͺ════════║\n", + "β”‚ 1 ┆ Eric Merritt ┆ male ┆ 32 ┆ … ┆ 1921-09-09 ┆ 18:17:14 ┆ 2022-07-16 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 04:16:08 ┆ β”‚\n", + "β”‚ 2 ┆ Lisa Cole ┆ male ┆ 39 ┆ … ┆ 1910-11-16 ┆ 13:42:38 ┆ 2022-07-26 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 00:33:24 ┆ β”‚\n", + "β”‚ 3 ┆ Tyler Mcguire ┆ male ┆ 35 ┆ … ┆ null ┆ 12:01:47 ┆ 2022-07-19 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 01:37:44 ┆ β”‚\n", + "β”‚ 4 ┆ Michael Craig ┆ male ┆ 36 ┆ … ┆ 1916-11-03 ┆ null ┆ 2022-07-26 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 11:18:06 ┆ β”‚\n", + "β”‚ 5 ┆ Kenneth Herman ┆ male ┆ 34 ┆ … ┆ 1913-06-23 ┆ 10:59:15 ┆ 2022-08-01 ┆ null β”‚\n", + "β”‚ ┆ Jr. 
┆ ┆ ┆ ┆ ┆ ┆ 00:48:58 ┆ β”‚\n", + "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" ] }, "execution_count": 14, @@ -873,8 +981,28 @@ "name": "stderr", "output_type": "stream", "text": [ - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:01<00:00, 12.79variables/s]\n", - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:00<00:00, 230.20variables/s]\n" + " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:01<00:00, 9.95variables/s]\n", + " PassengerId: 0%| | 0/13 [00:00\n", - "shape: (10, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
i64strcati64i64strf64strcatdatetimedatetime[ΞΌs]str
1"Michael Cook""male"310"1982"0.941278"F08""S"1920-12-3116:59:502022-07-28 12:10:16null
2"Christopher Brown""male"251"2637"0.648354null"S"1925-11-2110:44:012022-07-24 17:35:48null
3"Deborah Patterson""male"201"745718"1.674188"F94""S"1937-03-1318:21:182022-08-01 00:26:41null
4"Jordan Nelson""female"null0"958648"0.565636null"S"1915-04-0117:24:012022-08-09 17:04:09null
5"William Lee""female"220"9592"0.683327null"S"1934-07-2512:19:102022-08-14 08:13:53null
6"Casey Hodges""male"310"67623"0.287831null"C"1940-03-1615:40:572022-08-03 23:25:15null
7"Christopher Wu""male"null0"81566"0.727777"E755""S"1927-01-1217:23:452022-07-17 23:04:52null
8"Jenna Ochoa""male"350"157316"4.320606"B463""S"1936-02-0811:07:572022-08-13 16:54:32null
9"Brent Kirk""female"null0"881526"0.722181"C090""S"1932-08-2118:12:382022-08-03 21:50:35null
10"Diana Hunter""male"null0"9108"0.385063"D074""S"1922-10-1417:14:062022-08-05 20:37:46null
" + "shape: (10, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
i64strcati64i64strf64strcatdatetimedatetime[ΞΌs]str
1"Collin Mendoza""male"370"67250"0.197071null"S"null12:39:262022-07-16 05:15:58null
2"Jermaine Wang""female"330"54933"0.474988null"S"1907-03-0116:19:512022-07-30 11:44:02null
3"Patricia Jones""male"350"096496"0.716661null"S"1936-03-1814:47:222022-08-06 00:43:21null
4"Vanessa Franco""male"null0"7617"0.667793null"S"1915-01-0111:09:532022-07-25 22:03:19null
5"Shannon Rivers""female"360"09197"0.430992null"S"1929-01-3116:01:162022-07-27 10:21:13null
6"Joyce Hunter""male"390"6331"5.940336null"S"1914-08-1717:49:102022-07-30 23:39:31null
7"Alejandra Jarvis""female"390"R 86409"1.437764"E302""C"1933-09-2417:07:262022-07-22 20:58:23null
8"Ashley Smith""male"null0"472594"2.146417null"Q"1940-04-1713:02:432022-08-05 13:51:38null
9"Richard Farrell""male"280"93605"1.609702null"S"1904-02-1011:40:562022-07-29 20:48:05null
10"Melissa Marquez""male"370"91545"0.326197null"C"1917-04-2815:25:54nullnull
" ], "text/plain": [ "shape: (10, 13)\n", @@ -897,26 +1025,26 @@ "β”‚ i64 ┆ str ┆ cat ┆ i64 ┆ ┆ date ┆ time ┆ --- ┆ str β”‚\n", "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[ΞΌs] ┆ β”‚\n", "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ══════════════β•ͺ════════β•ͺ══════β•ͺ═══β•ͺ════════════β•ͺ════════════β•ͺ══════════════β•ͺ════════║\n", - "β”‚ 1 ┆ Michael Cook ┆ male ┆ 31 ┆ … ┆ 1920-12-31 ┆ 16:59:50 ┆ 2022-07-28 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 12:10:16 ┆ β”‚\n", - "β”‚ 2 ┆ Christopher ┆ male ┆ 25 ┆ … ┆ 1925-11-21 ┆ 10:44:01 ┆ 2022-07-24 ┆ null β”‚\n", - "β”‚ ┆ Brown ┆ ┆ ┆ ┆ ┆ ┆ 17:35:48 ┆ β”‚\n", - "β”‚ 3 ┆ Deborah ┆ male ┆ 20 ┆ … ┆ 1937-03-13 ┆ 18:21:18 ┆ 2022-08-01 ┆ null β”‚\n", - "β”‚ ┆ Patterson ┆ ┆ ┆ ┆ ┆ ┆ 00:26:41 ┆ β”‚\n", - "β”‚ 4 ┆ Jordan ┆ female ┆ null ┆ … ┆ 1915-04-01 ┆ 17:24:01 ┆ 2022-08-09 ┆ null β”‚\n", - "β”‚ ┆ Nelson ┆ ┆ ┆ ┆ ┆ ┆ 17:04:09 ┆ β”‚\n", - "β”‚ 5 ┆ William Lee ┆ female ┆ 22 ┆ … ┆ 1934-07-25 ┆ 12:19:10 ┆ 2022-08-14 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 08:13:53 ┆ β”‚\n", - "β”‚ 6 ┆ Casey Hodges ┆ male ┆ 31 ┆ … ┆ 1940-03-16 ┆ 15:40:57 ┆ 2022-08-03 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 23:25:15 ┆ β”‚\n", - "β”‚ 7 ┆ Christopher ┆ male ┆ null ┆ … ┆ 1927-01-12 ┆ 17:23:45 ┆ 2022-07-17 ┆ null β”‚\n", - "β”‚ ┆ Wu ┆ ┆ ┆ ┆ ┆ ┆ 23:04:52 ┆ β”‚\n", - "β”‚ 8 ┆ Jenna Ochoa ┆ male ┆ 35 ┆ … ┆ 1936-02-08 ┆ 11:07:57 ┆ 2022-08-13 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 16:54:32 ┆ β”‚\n", - "β”‚ 9 ┆ Brent Kirk ┆ female ┆ null ┆ … ┆ 1932-08-21 ┆ 18:12:38 ┆ 2022-08-03 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 21:50:35 ┆ β”‚\n", - "β”‚ 10 ┆ Diana Hunter ┆ male ┆ null ┆ … ┆ 1922-10-14 ┆ 17:14:06 ┆ 2022-08-05 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 20:37:46 ┆ β”‚\n", + "β”‚ 1 ┆ Collin ┆ male ┆ 37 ┆ … ┆ null ┆ 12:39:26 ┆ 2022-07-16 ┆ null β”‚\n", + "β”‚ ┆ Mendoza ┆ ┆ ┆ ┆ ┆ ┆ 05:15:58 ┆ β”‚\n", + "β”‚ 2 ┆ Jermaine ┆ female ┆ 33 ┆ … ┆ 1907-03-01 ┆ 16:19:51 ┆ 2022-07-30 ┆ null β”‚\n", + "β”‚ ┆ Wang ┆ ┆ ┆ ┆ ┆ ┆ 11:44:02 ┆ β”‚\n", + "β”‚ 3 ┆ Patricia ┆ male ┆ 35 ┆ … ┆ 1936-03-18 ┆ 14:47:22 ┆ 
2022-08-06 ┆ null β”‚\n", + "β”‚ ┆ Jones ┆ ┆ ┆ ┆ ┆ ┆ 00:43:21 ┆ β”‚\n", + "β”‚ 4 ┆ Vanessa ┆ male ┆ null ┆ … ┆ 1915-01-01 ┆ 11:09:53 ┆ 2022-07-25 ┆ null β”‚\n", + "β”‚ ┆ Franco ┆ ┆ ┆ ┆ ┆ ┆ 22:03:19 ┆ β”‚\n", + "β”‚ 5 ┆ Shannon ┆ female ┆ 36 ┆ … ┆ 1929-01-31 ┆ 16:01:16 ┆ 2022-07-27 ┆ null β”‚\n", + "β”‚ ┆ Rivers ┆ ┆ ┆ ┆ ┆ ┆ 10:21:13 ┆ β”‚\n", + "β”‚ 6 ┆ Joyce Hunter ┆ male ┆ 39 ┆ … ┆ 1914-08-17 ┆ 17:49:10 ┆ 2022-07-30 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 23:39:31 ┆ β”‚\n", + "β”‚ 7 ┆ Alejandra ┆ female ┆ 39 ┆ … ┆ 1933-09-24 ┆ 17:07:26 ┆ 2022-07-22 ┆ null β”‚\n", + "β”‚ ┆ Jarvis ┆ ┆ ┆ ┆ ┆ ┆ 20:58:23 ┆ β”‚\n", + "β”‚ 8 ┆ Ashley Smith ┆ male ┆ null ┆ … ┆ 1940-04-17 ┆ 13:02:43 ┆ 2022-08-05 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 13:51:38 ┆ β”‚\n", + "β”‚ 9 ┆ Richard ┆ male ┆ 28 ┆ … ┆ 1904-02-10 ┆ 11:40:56 ┆ 2022-07-29 ┆ null β”‚\n", + "β”‚ ┆ Farrell ┆ ┆ ┆ ┆ ┆ ┆ 20:48:05 ┆ β”‚\n", + "β”‚ 10 ┆ Melissa ┆ male ┆ 37 ┆ … ┆ 1917-04-28 ┆ 15:25:54 ┆ null ┆ null β”‚\n", + "β”‚ ┆ Marquez ┆ ┆ ┆ ┆ ┆ ┆ ┆ β”‚\n", "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" ] }, @@ -978,19 +1106,19 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
f64strcatf64f64strf64strcatdatetime[ms]timedatetime[ΞΌs]str
446.0nullnull29.6932770.381594null32.204208nullnull1921-07-27 22:08:24.79814:38:10.0147783252022-07-31 03:43:48.767209null
" + "shape: (1, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
f64strcatf64f64strf64strcatdatetime[ΞΌs]timedatetime[ΞΌs]str
446.0nullnull29.6932770.381594null32.204208nullnull1921-07-27 22:08:24.79704814:38:10.0147783252022-07-31 03:43:48.767209null
" ], "text/plain": [ "shape: (1, 13)\n", "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”\n", "β”‚ PassengerId ┆ Name ┆ Sex ┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married ┆ all_NA β”‚\n", "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ since ┆ --- β”‚\n", - "β”‚ f64 ┆ str ┆ cat ┆ f64 ┆ ┆ datetime[ms] ┆ time ┆ --- ┆ str β”‚\n", + "β”‚ f64 ┆ str ┆ cat ┆ f64 ┆ ┆ datetime[ΞΌs] ┆ time ┆ --- ┆ str β”‚\n", "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[ΞΌs] ┆ β”‚\n", "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ══════β•ͺ══════β•ͺ═══════════β•ͺ═══β•ͺ═══════════════β•ͺ══════════════β•ͺ══════════════β•ͺ════════║\n", "β”‚ 446.0 ┆ null ┆ null ┆ 29.693277 ┆ … ┆ 1921-07-27 ┆ 14:38:10.014 ┆ 2022-07-31 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ 22:08:24.798 ┆ 778325 ┆ 03:43:48.767 ┆ β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 209 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ 22:08:24.7970 ┆ 778325 ┆ 03:43:48.767 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ 48 ┆ ┆ 209 ┆ β”‚\n", "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" ] }, @@ -1013,14 +1141,30 @@ "name": "stderr", "output_type": "stream", "text": [ - " Name: 0%| | 0/13 [00:00\n", - "shape: (1, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
f64strcatf64f64strf64strcatdatetime[ms]timedatetime[ΞΌs]str
446.0nullnull29.4708390.079686null1.600957nullnull1922-01-13 18:40:11.65114:42:15.7502022-07-31 13:08:20.672110null
" + "shape: (1, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
f64strcatf64f64strf64strcatdatetime[ΞΌs]timedatetime[ΞΌs]str
446.0nullnull29.6425590.083053null1.761126nullnull1921-04-08 06:10:59.34066014:40:29.6046511622022-07-31 00:00:54.913473null
" ], "text/plain": [ "shape: (1, 13)\n", "β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”\n", "β”‚ PassengerId ┆ Name ┆ Sex ┆ Age ┆ … ┆ Birthday ┆ Board time ┆ Married ┆ all_NA β”‚\n", "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ since ┆ --- β”‚\n", - "β”‚ f64 ┆ str ┆ cat ┆ f64 ┆ ┆ datetime[ms] ┆ time ┆ --- ┆ str β”‚\n", + "β”‚ f64 ┆ str ┆ cat ┆ f64 ┆ ┆ datetime[ΞΌs] ┆ time ┆ --- ┆ str β”‚\n", "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ datetime[ΞΌs] ┆ β”‚\n", "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ══════β•ͺ══════β•ͺ═══════════β•ͺ═══β•ͺ═══════════════β•ͺ══════════════β•ͺ══════════════β•ͺ════════║\n", - "β”‚ 446.0 ┆ null ┆ null ┆ 29.470839 ┆ … ┆ 1922-01-13 ┆ 14:42:15.750 ┆ 2022-07-31 ┆ null β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ 18:40:11.651 ┆ ┆ 13:08:20.672 ┆ β”‚\n", - "β”‚ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 110 ┆ β”‚\n", + "β”‚ 446.0 ┆ null ┆ null ┆ 29.642559 ┆ … ┆ 1921-04-08 ┆ 14:40:29.604 ┆ 2022-07-31 ┆ null β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ 06:10:59.3406 ┆ 651162 ┆ 00:00:54.913 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ 60 ┆ ┆ 473 ┆ β”‚\n", "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" ] }, @@ -1114,7 +1258,30 @@ "name": "stderr", "output_type": "stream", "text": [ - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:00<00:00, 23.26variables/s]\n" + " PassengerId: 0%| | 0/13 [00:00\n", - "shape: (1, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
u32u32u32u32u32u32u32u32u32u32u32u32u32
0001790007073936797891
" + "shape: (1, 13)
PassengerIdNameSexAgeParchTicketFareCabinEmbarkedBirthdayBoard timeMarried sinceall_NA
u32u32u32u32u32u32u32u32u32u32u32u32u32
0001600006883886592891
" ], "text/plain": [ "shape: (1, 13)\n", @@ -1136,7 +1303,7 @@ "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- β”‚\n", "β”‚ u32 ┆ u32 ┆ u32 ┆ u32 ┆ ┆ u32 ┆ u32 ┆ u32 ┆ u32 β”‚\n", "β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ══════β•ͺ═════β•ͺ═════β•ͺ═══β•ͺ══════════β•ͺ════════════β•ͺ═══════════════β•ͺ════════║\n", - "β”‚ 0 ┆ 0 ┆ 0 ┆ 179 ┆ … ┆ 93 ┆ 67 ┆ 97 ┆ 891 β”‚\n", + "β”‚ 0 ┆ 0 ┆ 0 ┆ 160 ┆ … ┆ 88 ┆ 65 ┆ 92 ┆ 891 β”‚\n", "β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”˜" ] }, @@ -1153,7 +1320,10 @@ "cell_type": "markdown", "id": "9ab5b2b2c56065b8", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Step 7: Adding descriptions to variables" @@ -1163,7 +1333,10 @@ "cell_type": "markdown", "id": "27bc0135d1eebc38", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "With the data being taken care of, we can still do one last thing. We can add descriptions to the variables, to clarify what they mean. 
This can be particularly useful when sharing the `MetaFrame` or generated data with others, as it gives them more context to what they're working with.\n", @@ -1176,14 +1349,17 @@ "execution_count": 20, "id": "e37faad4df8ffde8", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:01<00:00, 12.89variables/s]\n" + " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:01<00:00, 9.81variables/s]\n" ] } ], @@ -1209,7 +1385,10 @@ "cell_type": "markdown", "id": "2b4bd251795308d5", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "We can get a list of all the descriptions in the fitted `MetaFrame` by accessing its `descriptions` property, as follows:" @@ -1220,7 +1399,10 @@ "execution_count": 21, "id": "694d4474707f7950", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1239,7 +1421,10 @@ "cell_type": "markdown", "id": "d8411571bf1e5bf7", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "Instead of setting the description in the variable specification (which happens before fitting a `MetaFrame` to a `DataFrame`), we can assign a description to an already generated `MetaFrame` by directly setting a column's description attribute. 
For example, we can assign a description to the `PassengerId` column as follows:" @@ -1250,7 +1435,10 @@ "execution_count": 22, "id": "34a8aafc95c0219f", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1271,7 +1459,10 @@ "cell_type": "markdown", "id": "5bf1c3ad724c9bfa", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "We can also set multiple descriptions of an already generated `MetaFrame` at once by passing in a dictionary of descriptions to its `descriptions` property. For example, we can set descriptions for the `Age` and `Name` columns as follows:" @@ -1282,7 +1473,10 @@ "execution_count": 23, "id": "3b2e873f7362160a", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1303,7 +1497,10 @@ "cell_type": "markdown", "id": "fcf3b9eb9bbf6b17", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "Instead of a dictionary, it is also possible to pass in a list of descriptions to the `descriptions` property of a `MetaFrame`. \n", @@ -1318,7 +1515,10 @@ "execution_count": 24, "id": "ab2b34ed6b11578c", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -1355,7 +1555,27 @@ "name": "stderr", "output_type": "stream", "text": [ - " all_NA: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 13/13 [00:00<00:00, 116.09variables/s]\n" + " PassengerId: 0%| | 0/13 [00:00 foreign key relationship. This can be very useful when data from different tables have to be combined. 
Metasyn includes a multitable feature to capture these kinds of relations.\n", + "\n", + "To examplify this, let's perform a simple join to combine the purchases and customers table:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4ad9b293", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1_000, 7)
idcustomer_idprice_paidproduct_idaddresscredit_card_nrsignup_date
i64i64stri64stri64date
08579"$78,287.51"37931"68997 Kevin Summit\n", + "Lake Veroni…41036197625981051980-01-02
154694"$901.94"30783"7809 Shepherd Orchard\n", + "South Sa…5038782145201979-02-05
266228"$48.93"80060"20642 Andrew Springs\n", + "Changfurt…35882175064236272014-08-02
353735"$62,333.64"68273"54914 Alexis Village\n", + "Pamelache…45711003065502016-06-02
427958"$2.95"3520"2331 Bradley Cliffs Apt. 998\n", + "B…49796853821122023-07-30
99548110"$41,246.73"23588"69021 Nelson Spur\n", + "East John, C…43575471099529611981-04-23
99657809"$3,998.16"12079"0062 Ayers View Suite 421\n", + "Reed…42395349112498736401994-08-03
99750554"$143.18"30135"2200 Solis Mountains Apt. 469\n", + "…49295870720935539862000-09-02
99839707"$786.54"77079"75892 Cody Haven Suite 201\n", + "Jac…5018317015261972-09-14
99914278"$48,105.34"3277"4472 Graves Crossroad Suite 60…42355329226388131985-06-15
" + ], + "text/plain": [ + "shape: (1_000, 7)\n", + "β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ id ┆ customer_id ┆ price_paid ┆ product_id ┆ address ┆ credit_card_nr ┆ signup_date β”‚\n", + "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\n", + "β”‚ i64 ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ date β”‚\n", + "β•žβ•β•β•β•β•β•ͺ═════════════β•ͺ════════════β•ͺ════════════β•ͺ═══════════════════β•ͺ══════════════════β•ͺ═════════════║\n", + "β”‚ 0 ┆ 8579 ┆ $78,287.51 ┆ 37931 ┆ 68997 Kevin ┆ 4103619762598105 ┆ 1980-01-02 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Summit ┆ ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Lake Veroni… ┆ ┆ β”‚\n", + "β”‚ 1 ┆ 54694 ┆ $901.94 ┆ 30783 ┆ 7809 Shepherd ┆ 503878214520 ┆ 1979-02-05 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Orchard ┆ ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ South Sa… ┆ ┆ β”‚\n", + "β”‚ 2 ┆ 66228 ┆ $48.93 ┆ 80060 ┆ 20642 Andrew ┆ 3588217506423627 ┆ 2014-08-02 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Springs ┆ ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Changfurt… ┆ ┆ β”‚\n", + "β”‚ 3 ┆ 53735 ┆ $62,333.64 ┆ 68273 ┆ 54914 Alexis ┆ 4571100306550 ┆ 2016-06-02 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Village ┆ ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Pamelache… ┆ ┆ β”‚\n", + "β”‚ 4 ┆ 27958 ┆ $2.95 ┆ 3520 ┆ 2331 Bradley ┆ 4979685382112 ┆ 2023-07-30 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Cliffs Apt. 998 ┆ ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ B… ┆ ┆ β”‚\n", + "β”‚ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … β”‚\n", + "β”‚ 995 ┆ 48110 ┆ $41,246.73 ┆ 23588 ┆ 69021 Nelson Spur ┆ 4357547109952961 ┆ 1981-04-23 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ East John, C… ┆ ┆ β”‚\n", + "β”‚ 996 ┆ 57809 ┆ $3,998.16 ┆ 12079 ┆ 0062 Ayers View ┆ 4239534911249873 ┆ 1994-08-03 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Suite 421 ┆ 640 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Reed… ┆ ┆ β”‚\n", + "β”‚ 997 ┆ 50554 ┆ $143.18 ┆ 30135 ┆ 2200 Solis ┆ 4929587072093553 ┆ 2000-09-02 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Mountains Apt. 
┆ 986 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ 469 ┆ ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ … ┆ ┆ β”‚\n", + "β”‚ 998 ┆ 39707 ┆ $786.54 ┆ 77079 ┆ 75892 Cody Haven ┆ 501831701526 ┆ 1972-09-14 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Suite 201 ┆ ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Jac… ┆ ┆ β”‚\n", + "β”‚ 999 ┆ 14278 ┆ $48,105.34 ┆ 3277 ┆ 4472 Graves ┆ 4235532922638813 ┆ 1985-06-15 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ Crossroad Suite ┆ ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ 60… ┆ ┆ β”‚\n", + "β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[\"purchases\"].join(data[\"customers\"], left_on=\"customer_id\", right_on=\"id\")" + ] + }, + { + "cell_type": "markdown", + "id": "74ad21ee", + "metadata": {}, + "source": [ + "### Synthesizing unrelated tables\n", + "\n", + "Now, let us naively generate synthetic data independently using metasyn without specifying any relations." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "470254c7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " signup_date: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:02<00:00, 1.66variables/s]\n", + " stock: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 6.90variables/s]\n", + " product_id: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 8.03variables/s]\n", + " signup_date: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 10.30variables/s]\n", + " stock: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 7.28variables/s]\n", + " product_id: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 7.09variables/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (0, 7)
idcustomer_idprice_paidproduct_idaddresscredit_card_nrsignup_date
i64i64stri64stri64date
" + ], + "text/plain": [ + "shape: (0, 7)\n", + "β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ id ┆ customer_id ┆ price_paid ┆ product_id ┆ address ┆ credit_card_nr ┆ signup_date β”‚\n", + "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\n", + "β”‚ i64 ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ date β”‚\n", + "β•žβ•β•β•β•β•β•ͺ═════════════β•ͺ════════════β•ͺ════════════β•ͺ═════════β•ͺ════════════════β•ͺ═════════════║\n", + "β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multiframe = MultiFrame.fit_dataframes(data, relations=[])\n", + "syn_data = multiframe.synthesize()\n", + "# Try to join the same tables\n", + "syn_data[\"purchases\"].join(syn_data[\"customers\"], left_on=\"customer_id\", right_on=\"id\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d56e9a2", + "metadata": {}, + "source": [ + "Note above that while the synthetic data has the same number of rows for the tables, the number of rows in the joined table is vastly different. This is because of the fact that the customer identifiers in the two tables are created independently." + ] + }, + { + "cell_type": "markdown", + "id": "f3694328", + "metadata": {}, + "source": [ + "### Synthesizing related tables\n", + "To remedy this, we can specify relations in the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "088535ff", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " signup_date: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:01<00:00, 3.07variables/s]\n", + " stock: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 8.74variables/s]\n", + " product_id: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 9.17variables/s]\n", + " signup_date: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 15.32variables/s]\n", + " stock: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 9.84variables/s]\n", + " product_id: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 8.23variables/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (1_000, 7)
idcustomer_idprice_paidproduct_idaddresscredit_card_nrsignup_date
i64i64stri64stri64date
056633"$8.39"16180"Myself."30634100450221096401981-03-13
126638"$8.49"51487"Some miss television. Ok hour …12939164676701005811978-01-09
233109"$24,052.58"9851"Thank."36987936785759725361976-01-11
333108"$310.00"71339"Cultural speak simple."24562450921051491372005-10-03
418098"$53,934.07"31004"Camera moment likely career."20452887690616472041988-08-05
99526428"$0,178.56"31004"Always level benefit son then."46387628063911529801992-08-31
99661582"$496,310.37"70844"Behavior debate serious new we…28140138239069689391995-09-11
99741090"$5,767.78"37768"Population institution electio…21445806059521123371970-04-16
9984265"$767.73"4302"Win continue view product nece…47837398976265331222025-06-14
99945705"$59,292.29"33472"Pressure two name."25199083125240680702008-05-18
" + ], + "text/plain": [ + "shape: (1_000, 7)\n", + "β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + "β”‚ id ┆ customer_id ┆ price_paid ┆ product_id ┆ address ┆ credit_card_nr ┆ signup_date β”‚\n", + "β”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚\n", + "β”‚ i64 ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ date β”‚\n", + "β•žβ•β•β•β•β•β•ͺ═════════════β•ͺ═════════════β•ͺ════════════β•ͺ══════════════════β•ͺ══════════════════β•ͺ═════════════║\n", + "β”‚ 0 ┆ 56633 ┆ $8.39 ┆ 16180 ┆ Myself. ┆ 3063410045022109 ┆ 1981-03-13 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ 640 ┆ β”‚\n", + "β”‚ 1 ┆ 26638 ┆ $8.49 ┆ 51487 ┆ Some miss ┆ 1293916467670100 ┆ 1978-01-09 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ television. Ok ┆ 581 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ hour … ┆ ┆ β”‚\n", + "β”‚ 2 ┆ 33109 ┆ $24,052.58 ┆ 9851 ┆ Thank. ┆ 3698793678575972 ┆ 1976-01-11 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ ┆ 536 ┆ β”‚\n", + "β”‚ 3 ┆ 33108 ┆ $310.00 ┆ 71339 ┆ Cultural speak ┆ 2456245092105149 ┆ 2005-10-03 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ simple. ┆ 137 ┆ β”‚\n", + "β”‚ 4 ┆ 18098 ┆ $53,934.07 ┆ 31004 ┆ Camera moment ┆ 2045288769061647 ┆ 1988-08-05 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ likely career. ┆ 204 ┆ β”‚\n", + "β”‚ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … β”‚\n", + "β”‚ 995 ┆ 26428 ┆ $0,178.56 ┆ 31004 ┆ Always level ┆ 4638762806391152 ┆ 1992-08-31 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ benefit son ┆ 980 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ then. 
┆ ┆ β”‚\n", + "β”‚ 996 ┆ 61582 ┆ $496,310.37 ┆ 70844 ┆ Behavior debate ┆ 2814013823906968 ┆ 1995-09-11 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ serious new we… ┆ 939 ┆ β”‚\n", + "β”‚ 997 ┆ 41090 ┆ $5,767.78 ┆ 37768 ┆ Population ┆ 2144580605952112 ┆ 1970-04-16 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ institution ┆ 337 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ electio… ┆ ┆ β”‚\n", + "β”‚ 998 ┆ 4265 ┆ $767.73 ┆ 4302 ┆ Win continue ┆ 4783739897626533 ┆ 2025-06-14 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ view product ┆ 122 ┆ β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ nece… ┆ ┆ β”‚\n", + "β”‚ 999 ┆ 45705 ┆ $59,292.29 ┆ 33472 ┆ Pressure two ┆ 2519908312524068 ┆ 2008-05-18 β”‚\n", + "β”‚ ┆ ┆ ┆ ┆ name. ┆ 070 ┆ β”‚\n", + "β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "relations = [\n", + " \"purchases[customer_id] SUBSET OF customers[id]\",\n", + " \"purchases[product_id] SUBSET OF products[id]\",\n", + "]\n", + "\n", + "multiframe_improved = MultiFrame.fit_dataframes(data, relations=relations)\n", + "syn_data_improved = multiframe_improved.synthesize()\n", + "syn_data_improved[\"purchases\"].join(syn_data_improved[\"customers\"], left_on=\"customer_id\", right_on=\"id\")" + ] + }, + { + "cell_type": "markdown", + "id": "99957e81", + "metadata": {}, + "source": [ + "In the presented table we only have SUBSET OF relations, but there are a few more:\n", + "\n", + "- `SUBSET OF`: Column a has values that are present in column b and can occur multiple times in column a.\n", + "- `EQUALS`: Column a has the same values as column b, but not necessarily in the same order. 
This also implies that the table of column a and the table of column b have the same number of rows.\n", + "- `EQUAL ORDERED`: Column a has the same values as column b and also in the same order. Also implies the tables have the same number of rows.\n", + "- `INFER FROM`: The relation between column a and b should be inferred by metasyn. This will result in one of the above relationships." + ] + }, + { + "cell_type": "markdown", + "id": "877d7606", + "metadata": {}, + "source": [ + "We can also adjust the size of the output tables for each individual table:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f377c09c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " signup_date: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 175.05variables/s]\n", + " stock: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 9.72variables/s]\n", + " product_id: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 4/4 [00:00<00:00, 8.00variables/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'customers': shape: (8, 4)\n", + " β”Œβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + " β”‚ id ┆ address ┆ credit_card_nr ┆ signup_date β”‚\n", + " β”‚ --- ┆ --- ┆ --- ┆ --- β”‚\n", + " β”‚ i64 ┆ str ┆ i64 ┆ date β”‚\n", + " β•žβ•β•β•β•β•β•β•β•ͺ═════════════════════════════════β•ͺ═════════════════════β•ͺ═════════════║\n", + " β”‚ 29629 ┆ Administration special throw c… ┆ 1481554008638341506 ┆ 1989-02-10 β”‚\n", + " β”‚ 59997 ┆ Nearly my debate here. ┆ 959730128663445303 ┆ 2018-03-13 β”‚\n", + " β”‚ 22318 ┆ Site reach everybody. 
Apply ra… ┆ 1912931111910970411 ┆ 2001-10-28 β”‚\n", + " β”‚ 70412 ┆ Really although my this four t… ┆ 1166320154871006193 ┆ 1985-10-03 β”‚\n", + " β”‚ 54554 ┆ Challenge science meeting seco… ┆ 4170223632403772682 ┆ 2005-11-02 β”‚\n", + " β”‚ 63113 ┆ Spend mean rock. ┆ 2708289156331850163 ┆ 2004-01-21 β”‚\n", + " β”‚ 38693 ┆ Wonder. ┆ 3188382574585536709 ┆ 1973-03-11 β”‚\n", + " β”‚ 66739 ┆ Daughter continue example pare… ┆ 222854555833926653 ┆ 1984-03-24 β”‚\n", + " β””β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜,\n", + " 'products': shape: (500, 4)\n", + " β”Œβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”\n", + " β”‚ id ┆ name ┆ current_price ┆ stock β”‚\n", + " β”‚ --- ┆ --- ┆ --- ┆ --- β”‚\n", + " β”‚ i64 ┆ str ┆ str ┆ i64 β”‚\n", + " β•žβ•β•β•β•β•β•β•β•ͺ═══════════β•ͺ═══════════════β•ͺ═══════║\n", + " β”‚ 72576 ┆ mgxlxrj ┆ $30.56 ┆ 6 β”‚\n", + " β”‚ 65586 ┆ tlua ┆ $990.89 ┆ 1 β”‚\n", + " β”‚ 38430 ┆ rpcdjm ┆ $54,036.64 ┆ 2 β”‚\n", + " β”‚ 56925 ┆ twmubhzlw ┆ $69,909.56 ┆ 6 β”‚\n", + " β”‚ 24299 ┆ wz ┆ $1.54 ┆ 4 β”‚\n", + " β”‚ … ┆ … ┆ … ┆ … β”‚\n", + " β”‚ 76674 ┆ tpsv ┆ $8.40 ┆ 4 β”‚\n", + " β”‚ 42453 ┆ knevjb ┆ $5.86 ┆ 6 β”‚\n", + " β”‚ 63006 ┆ uub ┆ $305,351.06 ┆ 5 β”‚\n", + " β”‚ 41397 ┆ xft ┆ $12.82 ┆ 8 β”‚\n", + " β”‚ 70220 ┆ mohqwra ┆ $7.88 ┆ 7 β”‚\n", + " β””β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”˜,\n", + " 'purchases': shape: (1_000, 4)\n", + " β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”\n", + " β”‚ id ┆ customer_id ┆ price_paid ┆ product_id β”‚\n", + " β”‚ --- ┆ --- ┆ --- ┆ --- 
β”‚\n", + " β”‚ i64 ┆ i64 ┆ str ┆ i64 β”‚\n", + " β•žβ•β•β•β•β•β•ͺ═════════════β•ͺ═════════════β•ͺ════════════║\n", + " β”‚ 0 ┆ 22318 ┆ $202,778.76 ┆ 61507 β”‚\n", + " β”‚ 1 ┆ 38693 ┆ $644.28 ┆ 68259 β”‚\n", + " β”‚ 2 ┆ 29629 ┆ $2,546.96 ┆ 41446 β”‚\n", + " β”‚ 3 ┆ 59997 ┆ $57,793.70 ┆ 19989 β”‚\n", + " β”‚ 4 ┆ 59997 ┆ $772.01 ┆ 27698 β”‚\n", + " β”‚ … ┆ … ┆ … ┆ … β”‚\n", + " β”‚ 995 ┆ 54554 ┆ $025,931.06 ┆ 81569 β”‚\n", + " β”‚ 996 ┆ 66739 ┆ $93.99 ┆ 26509 β”‚\n", + " β”‚ 997 ┆ 63113 ┆ $136,217.96 ┆ 56594 β”‚\n", + " β”‚ 998 ┆ 63113 ┆ $7.95 ┆ 39566 β”‚\n", + " β”‚ 999 ┆ 70412 ┆ $0.22 ┆ 39954 β”‚\n", + " β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multiframe_improved.synthesize(n={\"customers\": 8})" + ] + }, + { + "cell_type": "markdown", + "id": "7ced986f", + "metadata": {}, + "source": [ + "### Inspecting multiframes\n", + "\n", + "You can inspect multiframes with a print statement." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f8d6df6d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table customers:\n", + " Number of rows: 200\n", + " Number of columns: 4\n", + " Columns: id, address, credit_card_nr, signup_date\n", + "\n", + "Table products:\n", + " Number of rows: 500\n", + " Number of columns: 4\n", + " Columns: id, name, current_price, stock\n", + "\n", + "Table purchases:\n", + " Number of rows: 1000\n", + " Number of columns: 4\n", + " Columns: id, customer_id, price_paid, product_id\n", + "\n", + "Relations between columns:\n", + " purchases[customer_id] SUBSET OF customers[id]\n", + " purchases[product_id] SUBSET OF products[id]\n", + "\n" + ] + } + ], + "source": [ + "print(multiframe_improved)" + ] + }, + { + "cell_type": "markdown", + "id": "ea694208", + "metadata": {}, + "source": [ + "You can select and inspect metaframes (representations of the individual tables) with brackets `[]`:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bd2e1c78", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Rows: 1000\n", + "# Columns: 4\n", + "\n", + "Column 1: \"id\"\n", + "- Variable Type: discrete\n", + "- Data Type: Int64\n", + "- Proportion of Missing Values: 0.0000\n", + "- Distribution:\n", + "\t- Type: core.unique_key\n", + "\t- Parameters:\n", + "\t\t- lower: 0\n", + "\t\t- consecutive: True\n", + "\t\n", + "\n", + "Column 2: \"customer_id\"\n", + "- Variable Type: discrete\n", + "- Data Type: Int64\n", + "- Proportion of Missing Values: 0.0000\n", + "- Distribution:\n", + "\t- Type: core.multinoulli\n", + "\t- Parameters:\n", + "\t\t- labels: [ 216 538 1634 1869 3109 4186 4220 5936 6422 6931 7065 7080\n", + "\t 8198 8442 8579 8714 9142 9175 9720 9738 9775 9827 10383 11558\n", + "\t 11568 12034 12154 13065 13066 13169 13182 13535 14037 14278 14846 15318\n", + "\t 16097 16216 16853 17149 
17518 17998 18662 18727 19211 19884 19895 20062\n", + "\t 20731 21156 21330 21453 21670 22057 22363 23038 23486 24366 24901 24955\n", + "\t 25571 25838 26804 27122 27521 27958 28163 28257 28375 28418 28699 28760\n", + "\t 29023 29313 29315 29586 29809 30968 31278 31666 31770 32125 32862 33416\n", + "\t 33520 33954 34271 34983 35106 35147 35996 36001 37428 37545 37644 37659\n", + "\t 38064 38111 39630 39707 41080 41730 41864 42057 42337 42679 42779 42839\n", + "\t 43669 43725 44382 44905 45401 45941 46346 46708 46754 48110 48147 48937\n", + "\t 48954 49018 49041 49833 49876 50169 50314 50554 50830 50847 51257 51279\n", + "\t 51427 51451 51471 51503 51991 52098 52232 52396 52904 52930 53299 53301\n", + "\t 53572 53735 54591 54694 55472 56855 56876 56991 57256 57304 57543 57544\n", + "\t 57647 57809 58050 58775 59360 59386 60213 60357 60736 60762 60826 60985\n", + "\t 61274 61591 61772 62429 62737 62805 62926 62971 63322 63901 63994 64030\n", + "\t 64431 64448 66228 67458 67787 67873 68751 68765 68862 69118 69158 70515\n", + "\t 70669 70682 71286 72970 73006]\n", + "\t\t- probs: [0.004 0.01 0.005 0.004 0.005 0.003 0.005 0.009 0.009 0.006 0.002 0.005\n", + "\t 0.002 0.005 0.008 0.005 0.01 0.006 0.007 0.003 0.001 0.005 0.005 0.005\n", + "\t 0.008 0.004 0.005 0.007 0.006 0.009 0.005 0.004 0.006 0.006 0.004 0.005\n", + "\t 0.001 0.002 0.005 0.003 0.009 0.005 0.007 0.006 0.005 0.004 0.005 0.005\n", + "\t 0.006 0.005 0.004 0.007 0.004 0.003 0.004 0.004 0.004 0.008 0.004 0.01\n", + "\t 0.006 0.002 0.005 0.004 0.004 0.006 0.005 0.004 0.001 0.006 0.003 0.002\n", + "\t 0.007 0.004 0.003 0.005 0.003 0.001 0.005 0.005 0.011 0.003 0.006 0.003\n", + "\t 0.005 0.006 0.003 0.005 0.005 0.007 0.005 0.006 0.006 0.003 0.003 0.006\n", + "\t 0.007 0.008 0.003 0.007 0.005 0.008 0.005 0.008 0.003 0.008 0.004 0.008\n", + "\t 0.006 0.004 0.01 0.007 0.009 0.002 0.004 0.005 0.004 0.008 0.003 0.009\n", + "\t 0.005 0.005 0.006 0.005 0.007 0.006 0.004 0.006 0.002 0.005 0.002 0.003\n", + "\t 0.003 
0.006 0.006 0.003 0.007 0.008 0.003 0.006 0.006 0.004 0.004 0.007\n", + "\t 0.005 0.005 0.003 0.007 0.004 0.007 0.003 0.003 0.005 0.004 0.005 0.002\n", + "\t 0.006 0.006 0.005 0.005 0.007 0.003 0.002 0.004 0.001 0.002 0.004 0.006\n", + "\t 0.007 0.005 0.006 0.001 0.002 0.005 0.009 0.004 0.005 0.006 0.004 0.004\n", + "\t 0.005 0.008 0.005 0.005 0.003 0.006 0.004 0.003 0.01 0.007 0.003 0.008\n", + "\t 0.005 0.004 0.008 0.003 0.005]\n", + "\t\n", + "\n", + "Column 3: \"price_paid\"\n", + "- Variable Type: string\n", + "- Data Type: String\n", + "- Proportion of Missing Values: 0.0000\n", + "- Distribution:\n", + "\t- Type: core.regex\n", + "\t- Parameters:\n", + "\t\t- regex: \\$[0-9]{1,3}(|,[0-9]{3})\\.[0-9]{2}\n", + "\t\n", + "\n", + "Column 4: \"product_id\"\n", + "- Variable Type: discrete\n", + "- Data Type: Int64\n", + "- Proportion of Missing Values: 0.0000\n", + "- Distribution:\n", + "\t- Type: core.uniform\n", + "\t- Parameters:\n", + "\t\t- lower: 369\n", + "\t\t- upper: 85294\n", + "\t\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(multiframe_improved[\"purchases\"])" + ] + }, + { + "cell_type": "markdown", + "id": "c3291964", + "metadata": {}, + "source": [ + "### Saving and loading multiframes\n", + "\n", + "Similar to metaframes, multiframes can also be saved and loaded from a .json file. This .json file is a GMF (Generative Metadata Format) file that has the same structure as when the metadata of single tables are stored." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "261bc49d", + "metadata": {}, + "outputs": [], + "source": [ + "multiframe.save_json(\"test.json\")\n", + "mf = multiframe.load_json(\"test.json\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "3.12.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/metasyn/__init__.py b/metasyn/__init__.py index 9876a878..ab56709c 100644 --- a/metasyn/__init__.py +++ b/metasyn/__init__.py @@ -21,7 +21,7 @@ from importlib.metadata import version from metasyn import distribution, file, privacy -from metasyn.demo.dataset import demo_dataframe, demo_file +from metasyn.demo.dataset import demo_data, demo_dataframe, demo_file from metasyn.distribution.base import metadist from metasyn.file import ( read_csv, @@ -36,13 +36,16 @@ write_tsv, ) from metasyn.metaframe import MetaFrame +from metasyn.multiframe import MultiFrame from metasyn.var import MetaVar from metasyn.varspec import VarSpec __all__ = [ "MetaVar", "MetaFrame", + "MultiFrame", "demo_file", + "demo_data", "demo_dataframe", "metadist", "VarSpec", diff --git a/metasyn/__main__.py b/metasyn/__main__.py index e5b7e3e6..43d63fd6 100644 --- a/metasyn/__main__.py +++ b/metasyn/__main__.py @@ -14,7 +14,7 @@ from metasyn import MetaFrame from metasyn.config import MetaConfig from metasyn.file import file_interface_from_dict, get_file_interface_class, read_file -from metasyn.validation import create_schema +from metasyn.gmf import GmfV20Parser EXAMPLE_CREATE_META="metasyn create-meta your_dataset.csv -o your_gmf_file.json --config your_config.toml" # noqa: E501 EXAMPLE_CREATE_TOML="metasyn create-meta your_dataset.csv -o 
your_gmf_file.toml --config your_config.toml" # noqa: E501 @@ -265,7 +265,7 @@ def schema(input_args) -> None: f"\n Available plugins: {pl_avail}" ) parser.error(errmsg) - jsonschema = create_schema(list(plugins)) + jsonschema = GmfV20Parser().create_schema(list(plugins)) if args.output is None: print(json.dumps(jsonschema, indent=4)) else: diff --git a/metasyn/demo/__init__.py b/metasyn/demo/__init__.py index 35a2d05d..d126a7fe 100644 --- a/metasyn/demo/__init__.py +++ b/metasyn/demo/__init__.py @@ -1,5 +1,5 @@ """Package to create and retrieve demo datasets used in tutorials.""" -from metasyn.demo.dataset import demo_dataframe, demo_file +from metasyn.demo.dataset import demo_data, demo_dataframe, demo_file -__all__ = ["demo_file", "demo_dataframe"] +__all__ = ["demo_file", "demo_dataframe", "demo_data"] diff --git a/metasyn/demo/dataset.py b/metasyn/demo/dataset.py index 346ac627..57fa4d2e 100644 --- a/metasyn/demo/dataset.py +++ b/metasyn/demo/dataset.py @@ -2,11 +2,13 @@ # import random import string +import warnings from abc import ABC, abstractmethod from datetime import date, datetime, time, timedelta from importlib.resources import files from pathlib import Path +import faker import numpy as np import polars as pl @@ -37,6 +39,9 @@ def name(self): def file_location(self): return files(__package__) / f"demo_{self.name}.csv" + def get_data(self): + return self.get_dataframe() + def get_dataframe(self): return pl.read_csv(self.file_location, schema_overrides=self.schema, try_parse_dates=True) @@ -50,6 +55,32 @@ def var_specs(self): return [] +class BaseMultiDataset(BaseDataset): + """Abstract class to define a dataset with multiple tables.""" + + def get_data(self): + """Alias for get_dataframes().""" + return self.get_dataframes() + + @property + @abstractmethod + def file_location(self): + pass + + def get_dataframes(self): + """Create the dataframes (from file for example). + + Returns + ------- + dataframes: + Dictionary with dataframes. 
+ """ + return {name: pl.read_csv(path, schema_overrides=self.schema, try_parse_dates=True) + for name, path in self.file_location.items()} + + def get_dataframe(self): + return self.get_dataframes() + @register class TitanicDataset(BaseDataset): """Included in pandas, but post-processed to contain more columns.""" @@ -274,6 +305,70 @@ def create(cls, csv_file): # Write to a csv file pl.DataFrame(all_series).write_csv(csv_file) +@register +class ShopMultiDataset(BaseMultiDataset): + """An example dataset containing customers, products and purchases.""" + + @property + def name(self): + return "shop_multi" + + @property + def schema(self): + return {} + + @property + def file_location(self): + return { + "customers": files(__package__) / self.name / "customers.csv", + "products": files(__package__) / self.name / "products.csv", + "purchases": files(__package__) / self.name / "purchases.csv", + } + + @classmethod + def create(cls, data_dir: Path, n_user: int = 200, n_product: int = 500, + n_purchases: int = 1000): + user_ids = np.unique(np.random.randint(123, 123456+100*n_user, size=2*n_user))[:n_user] + np.random.shuffle(user_ids) + + product_ids = np.unique(np.random.randint(123, 123456+100*n_product, size=2*n_product) + )[:n_product] + np.random.shuffle(product_ids) + + fake = faker.Faker() + df_customers = pl.DataFrame( + { + "id": user_ids, + "address": [fake.address() for _ in range(n_user)], + "credit_card_nr": [fake.credit_card_number() for _ in range(n_user)], + "signup_date": [fake.date() for _ in range (n_user)] + } + ) + df_customers + + df_products = pl.DataFrame( + { + "id": product_ids, + "name": [fake.word() for _ in range(n_product)], + "current_price": [fake.pricetag() for _ in range(n_product)], + "stock": np.random.randint(0, 10, size=n_product) + } + ) + + df_purchases = pl.DataFrame( + { + "id": np.arange(n_purchases), + "customer_id": df_customers["id"].sample(n_purchases, with_replacement=True, + shuffle=True), + "price_paid": [fake.pricetag() 
for _ in range(n_purchases)], + "product_id": df_products["id"].sample(n_purchases, with_replacement=True, + shuffle=True), + } + ) + df_products.write_csv(data_dir / "products.csv") + df_purchases.write_csv(data_dir / "purchases.csv") + df_customers.write_csv(data_dir / "customers.csv") + def _get_demo_class(name): if name in _AVAILABLE_DATASETS: @@ -286,13 +381,16 @@ def _get_demo_class(name): def demo_file(name: str = "titanic") -> Path: """Get the path for a demo data file. - There are six options: + There are eight options: - titanic (Included in pandas, but post-processed to contain more columns) - spaceship (CC-BY from https://www.kaggle.com/competitions/spaceship-titanic) - synthea_imaging (CC-BY from https://synthea.mitre.org/downloads) - fruit (very basic example data from Polars) - survey (columns from ESS round 11 Human Values Scale questionnaire for the Netherlands) - test (columns with all supported data types) + - hospital (Example electronic health record hospital dataset) + - druguse (Example dataset with answers to an open question on study participants' + daily drug use) Arguments --------- @@ -312,21 +410,24 @@ def demo_file(name: str = "titanic") -> Path: return _get_demo_class(name).file_location -def demo_dataframe(name: str = "titanic") -> pl.DataFrame: +def demo_data(name: str = "titanic") -> pl.DataFrame: """Get a demonstration dataset as a prepared polars dataframe. 
- There are six options: + There are eight options: - titanic (Included in pandas, but post-processed to contain more columns) - spaceship (CC-BY from https://www.kaggle.com/competitions/spaceship-titanic) - synthea_imaging (CC-BY from https://synthea.mitre.org/downloads) - fruit (very basic example data from Polars) - survey (columns from ESS round 11 Human Values Scale questionnaire for the Netherlands) - test (columns with all supported data types) + - hospital (Example electronic health record hospital dataset) + - druguse (Example dataset with answers to an open question on study participants' + daily drug use) Arguments --------- name: - Name of the demo dataset: spaceship, fruit, or titanic. + Name of the demo dataset. Returns ------- @@ -338,4 +439,11 @@ def demo_dataframe(name: str = "titanic") -> pl.DataFrame: file, edition 1.0 [Data set]. Sikt - Norwegian Agency for Shared Services in Education and Research. https://doi.org/10.21338/ess11e01_0 """ - return _get_demo_class(name).get_dataframe() + return _get_demo_class(name).get_data() + +def demo_dataframe(name: str = "titanic") -> pl.DataFrame: + """Legacy alias for demo_data.""" + warnings.warn("The function demo_dataframe is deprecated in favor of demo_data.", + DeprecationWarning, + stacklevel=2) + return demo_data(name) diff --git a/metasyn/demo/shop_multi/customers.csv b/metasyn/demo/shop_multi/customers.csv new file mode 100644 index 00000000..6af132f9 --- /dev/null +++ b/metasyn/demo/shop_multi/customers.csv @@ -0,0 +1,401 @@ +id,address,credit_card_nr,signup_date +21330,"55036 Buchanan Loaf Apt. 324 +Stephaniefort, GA 65575",372318430068714,1978-04-15 +28257,"30746 Erin Villages Suite 203 +South Nicolefurt, MS 96250",4835573590606,2009-01-07 +52098,"Unit 5586 Box 2705 +DPO AA 89973",2251023388363453,1979-09-29 +42337,"9137 Christina Square +South Kimstad, WY 63878",4669952031417658,2016-09-11 +67458,"3047 Harris Mall Apt. 
278 +Oscarberg, MT 30361",36950970004382,1970-03-05 +60762,"0699 Fox Creek +Hayneshaven, PR 32996",4253643923569,2018-11-27 +66228,"20642 Andrew Springs +Changfurt, ME 87257",3588217506423627,2014-08-02 +51471,"545 Christian Orchard +Port Andrewfort, HI 42818",4757308338434,1970-07-16 +13065,"84311 Clay Walks +West Mariaton, AK 53991",4043850906948634865,2023-03-20 +67873,"USNS Vaughn +FPO AA 53396",3596228036958099,2004-03-29 +41864,"9792 Campbell Rapids +East Leah, VI 22461",2249418058674707,1977-03-07 +43725,"6159 Rivas Stravenue +Natashamouth, UT 62650",6583331059327750,1977-12-05 +56876,"9103 Torres Lodge Suite 679 +Port Karinafurt, AR 47179",372865677230878,1993-03-21 +21453,"USNV Pitts +FPO AA 24797",4899945905496,2006-04-19 +68862,"63464 Phillips Lake +Keithview, IL 40934",502053580606,2002-01-24 +37545,"479 Trujillo Drive +New Scott, MO 34759",4698122098699,2002-09-21 +59386,"1737 Keith Locks +Danielview, NC 59141",676104267817,1970-05-06 +67787,"0726 James Island +East James, NJ 95132",6011304546558321,2002-05-01 +34983,"537 Ward Squares Apt. 198 +Matthewport, ND 74468",38569233638949,2009-11-20 +57256,"268 Benson Well Apt. 597 +Port Jessicaville, NJ 36403",2521214626877438,2017-03-28 +17518,"32241 Thomas Underpass +Andreaville, MH 37656",3546106481007359,2018-09-18 +23038,"93999 Kathleen Overpass Suite 100 +Riversburgh, NC 53987",4563163565810162568,1989-02-23 +12034,"10535 Donna Shore Apt. 119 +West Jessica, DE 70257",639074052577,2014-11-20 +49833,"06520 Kelley Expressway +Marshallhaven, NH 28983",4798027954774833315,2016-03-29 +62926,"524 Linda Ways Apt. 167 +East Yvonnetown, IL 39052",30212224616297,2017-01-06 +14846,"346 Laurie Lodge +New Chad, LA 48517",30431827532228,2022-12-16 +56855,"6730 Andre Run Apt. 414 +Michelleport, WI 27427",4107421042139321,1980-10-24 +56991,"358 Michelle River Apt. 
892 +New Michelleville, AR 20718",675986754173,1989-02-17 +52904,"8067 Jessica Garden Suite 208 +Gregoryview, KS 05130",341040253198091,2019-01-10 +70682,"2313 Anthony Isle +South Thomasmouth, AZ 86754",3547164185713086,2003-02-11 +59360,"0993 Gamble Station +Valeriehaven, IL 25600",3557205174052635,2023-09-21 +28375,"31365 Macias Fork +Jonathanhaven, OR 57100",213151179924307,2018-11-12 +44382,"435 Steven Dam +Bradleyland, ME 33363",3536912648407289,1994-07-26 +73006,"PSC 6456, Box 8429 +APO AE 72996",213184556193378,2007-02-10 +17998,"2098 Johnson Fort Suite 795 +Coleport, PR 56358",4123156988024414,1986-05-14 +28699,"13170 Scott Course Apt. 480 +Barnesfort, HI 89535",3510346221383445,2018-08-31 +8442,"335 Griffin Club Suite 572 +South William, AZ 21630",4928695758041428,1985-01-26 +31770,"06391 Hicks Loaf +West Aprilton, MI 77853",4301464870979,2001-10-17 +4220,"431 Valdez Hills Apt. 358 +North Kimberlyhaven, VA 38630",4003423572734,1987-12-07 +52232,"4764 Miranda Parkway +Davidsonborough, FL 87543",180065934422051,2016-11-30 +57544,"000 Nicholas Shoal +West Markstad, DE 36679",4430458397481,1999-06-01 +42057,"3299 James Pines +Hayesmouth, NV 57928",345499040046425,2001-12-07 +31278,"8182 Stanton Key +Lake Courtneychester, AR 21140",6011251157715106,1998-02-06 +61591,"PSC 9908, Box 7062 +APO AP 45374",3557159147651520,2015-10-04 +11568,"Unit 2258 Box 9804 +DPO AE 69809",3530739762426860,1998-12-09 +48110,"69021 Nelson Spur +East John, CA 14073",4357547109952961,1981-04-23 +49041,"4587 Elizabeth Centers Apt. 414 +Millerborough, NY 24177",4715884778212436,1995-10-16 +18662,"PSC 0276, Box 7326 +APO AP 29014",374504826169244,2017-09-11 +37644,"598 Roberts Spur +Debraville, MO 05995",4206363427491796,2006-10-18 +23486,"238 Griffin Pines Apt. 681 +South Lisa, VT 22509",38871896575263,1972-08-08 +57543,"15023 Hoffman Mission Apt. 705 +Lake Morgan, NM 76798",4765656234921917817,2009-11-26 +35147,"20777 Gilbert Forges Apt. 
861 +Gregport, PA 72559",30269314844684,2024-04-23 +27521,"USCGC Dixon +FPO AP 50144",213163188222649,2000-11-30 +538,"6592 Jessica Knolls +Austinport, AL 76479",6567947362888038,1972-03-29 +14278,"4472 Graves Crossroad Suite 609 +Birdport, WY 45785",4235532922638813,1985-06-15 +46754,"USS Caldwell +FPO AA 60695",4019233846967837080,2001-11-12 +64448,"454 Williams Motorway +North Joshua, TN 06866",4900512348878,1994-04-16 +50169,"48219 Patrick Springs +Martinezfurt, VT 21200",4408300228062,1977-09-19 +39630,"USCGC Haley +FPO AP 83181",3566991235875523,2010-10-15 +29313,"77659 Connor Street +West Bonnieton, LA 70751",2720167370624381,2007-02-01 +52930,"4146 Madison Lodge +East Anthony, OR 35647",2381199027683452,2014-09-04 +21156,"6259 Rebecca Forest Apt. 451 +Robertoburgh, IN 47995",180045771432751,2017-07-11 +51257,"81169 Elizabeth Route Apt. 612 +New Matthew, TN 51834",213180190423239,1979-12-24 +19884,"Unit 6384 Box 6824 +DPO AP 26917",6011924823539858,2017-06-25 +16097,"6684 Terri Harbors +South Allen, RI 34745",4199035934635056042,1990-07-15 +42839,"0600 Hector Hills +Lake Stephenborough, KS 48993",5120028917019528,2002-03-31 +7065,"162 Randall Parkways +New Teresachester, KS 68497",4114604128279068,1982-07-17 +53301,"Unit 3079 Box 9887 +DPO AE 94891",3576045040527718,1979-06-08 +24366,"14145 Vargas Club +Theresaburgh, SC 99225",6011499829608486,2019-05-15 +5936,"5378 Jesse Brooks +Lake Robert, TN 87773",30350963282176,1995-09-04 +45941,"453 Sarah Mission Apt. 524 +New Michelle, NM 35057",180007402307414,2013-08-20 +58775,"0406 James Common +Sophiaport, SD 35734",4491309629946370,2023-09-14 +41730,"121 Mullins Drive Apt. 966 +Port Jacob, MA 53655",3546521350385662,1986-06-26 +22363,"469 Garner Orchard +South Josephfurt, NV 46344",4666084049899132,2007-07-03 +63322,"068 Martin Square Apt. 
009 +Robertfort, VI 71176",378433798142128,2011-11-01 +29586,"752 Joanne Club +South Melissa, LA 45651",2262600144259070,1997-08-04 +37659,"USCGC Wallace +FPO AP 59576",4200791293337,1995-01-29 +42779,"413 Emily Pass Apt. 778 +North Kathleen, OH 36421",4231386245671,1971-02-26 +10383,"022 Smith Road +East Wendymouth, VA 55159",348254026698336,1990-09-07 +38111,"672 Jerry Harbors +East Roytown, GA 12766",3515152728065310,1984-08-04 +13182,"83393 Colton Mount Suite 210 +South Alexanderchester, NV 18665",30105202558889,2016-08-24 +51991,"0795 Russell Glens Suite 545 +South Shawna, IN 62890",4524473432842944,1989-02-27 +55472,"079 Morton Divide +East Christina, WI 59130",4184299796716825,1985-09-25 +3109,"85529 Ballard Parks +Marquezhaven, IL 97969",30562915508231,1978-05-08 +20062,"675 Singh Crescent +New Brandymouth, MD 89942",2720397374299779,1981-06-09 +54591,"13206 Margaret Knoll Apt. 754 +Jonesmouth, VI 94993",2700933528215859,2001-03-25 +60357,"415 Jefferson Radial +Joelhaven, WY 08097",6011669218289201,2011-09-22 +37428,"205 Coleman Loop +Elizabethside, FM 87320",6539271242406753,1983-06-26 +71286,"835 Martha Ports Apt. 556 +Lake Rebecca, MI 14544",341191829192868,2003-11-09 +51279,"8676 Harris Walks Suite 639 +Riverafurt, WY 04053",4830544739796,2025-06-19 +69118,"34143 Ross Roads Apt. 899 +Malloryshire, PR 98500",4835013956720458,1987-07-30 +29023,"60985 Klein Forge Suite 789 +West David, FL 92704",2239024575767149,1996-11-23 +13535,"40952 Kevin Radial +Haydenshire, IN 17387",6586893394130667,2019-05-27 +63994,"06981 Thomas Inlet +Lake Scott, KS 52710",3531320405035308,1987-01-15 +35996,"6329 Chan Alley +Reyesburgh, SD 58009",4723440056299033343,1973-12-19 +50554,"2200 Solis Mountains Apt. 
469 +East Michael, VT 33672",4929587072093553986,2000-09-02 +49018,"3823 Heidi Parkways +Lemouth, DE 58198",4406914077497018216,1993-04-11 +50847,"3013 Brandon Crossroad Suite 229 +South Darren, SC 69235",180088960558283,2003-11-02 +4186,"1098 Daniel Crossing Suite 799 +Houseton, PW 06887",4443061757795509263,1974-08-11 +35106,"841 Guzman Cliff +Brooksmouth, RI 46589",060464104367,1970-05-10 +36001,"778 Kristie Fork Apt. 550 +Khanborough, MA 34019",4485186377403,2006-11-28 +39707,"75892 Cody Haven Suite 201 +Jackiestad, PR 63265",501831701526,1972-09-14 +33520,"5207 Lambert Crossroad +Mannville, WA 54384",30190502768552,2020-02-09 +43669,"8382 Cantrell Viaduct +Leeside, FL 96954",3522123112488336,1983-02-25 +51427,"455 Fernandez Cliff Apt. 823 +West Robertport, IA 93505",3593050495294300,2025-03-31 +66301,"USS Parker +FPO AA 08950",4283624310074718,2022-11-05 +58050,"305 Carey Village +Lake Brian, MD 75564",3508838528991635,2015-06-05 +24793,"USCGC Frazier +FPO AA 71380",4940523352007344,2013-01-08 +64431,"57446 Sarah Greens Suite 513 +South Diane, MD 91701",3593211778626200,1986-12-10 +44905,"796 Phillips Stream +Fisherbury, NM 39943",4343834364280676778,2011-12-18 +49876,"84771 Walker Freeway +Kaufmanview, OR 85939",4671459272372510300,1994-05-30 +22057,"149 Gene Via +Port Mark, ID 79915",3596373548370613,2022-06-30 +60736,"720 Natalie Dam Apt. 
388 +Andreaport, AK 00965",213172774992143,2024-03-04 +6422,"Unit 4454 Box 2003 +DPO AP 60446",3572007674615908,2020-08-27 +57304,"09717 Deborah Branch Suite 745 +Port Erinton, MD 70769",639095665563,2023-07-29 +61274,"249 Douglas Shoal +New Keithstad, IA 51621",6011198701557205,1991-03-07 +68765,"8589 Morales Squares +Dawsonville, VI 08832",30153769158305,2002-04-06 +1869,"07085 Wyatt Ports Suite 206 +Shannonfort, MT 43416",4041924258552063478,1978-03-03 +57809,"0062 Ayers View Suite 421 +Reedmouth, AR 61031",4239534911249873640,1994-08-03 +9738,"6466 Belinda Square +South Patrickview, KS 48292",3591167986557680,2006-04-09 +13066,"3168 Green Centers +Rayport, CO 61310",30266352579107,2024-04-29 +27958,"2331 Bradley Cliffs Apt. 998 +Brittanyborough, IA 18788",4979685382112,2023-07-30 +8198,"48061 Nolan Ridge +West Ebony, AS 30101",4924777977912912,1998-03-05 +11558,"37413 Clark Pines +Port Erinfurt, PA 12269",4276676565858,2024-11-25 +48937,"7736 Padilla Shoals Apt. 409 +Jonesfurt, OH 66173",3552351741271293,1973-07-29 +26804,"79675 Keller Radial Suite 791 +Holmesfurt, PW 32387",4197977591097872,1996-04-21 +9775,"USS Bryant +FPO AP 80275",630493861789,1988-11-22 +70515,"808 Christopher Green Suite 170 +South Regina, WY 23207",4672865576658,1989-11-22 +60985,"130 Andrea Shores Apt. 325 +Keyport, ND 76382",4438570987168305,2020-09-21 +3617,"856 Walker Mall +Hodgesborough, KS 99211",4289030147038807,1990-10-08 +9827,"043 Linda Overpass Apt. 757 +Kempborough, KY 99668",4982957391604308,2018-04-04 +53299,"76220 Pollard Port Apt. 357 +Griffinland, WI 30264",213126956643486,2022-04-29 +42679,"192 Mueller Points Suite 631 +East Darrell, AK 88618",4536218127955062888,1979-08-27 +72970,"3784 Melanie Tunnel +Georgeburgh, OH 74389",6011332131578966,1996-06-08 +28163,"85623 Donald Station Apt. 463 +New Jadehaven, AZ 34165",213174148727536,1977-03-25 +32125,"10471 Moran Mountain Apt. 907 +West Rebecca, AS 59780",3520595563853260,2003-06-22 +16853,"48981 Linda Trafficway Apt. 
280 +Grimesfurt, KS 81885",371705032256278,1995-11-22 +28418,"94482 Wells Route Apt. 487 +New Julian, AZ 81215",6565128977198335,2004-02-26 +19211,"42059 Bartlett Brook +New Joseburgh, NE 30881",4000278109141,1995-05-06 +12154,"Unit 0441 Box 7381 +DPO AE 61591",5359180053188815,2019-06-05 +216,"73990 Justin River Apt. 715 +Arellanoville, CA 90553",6011302347181434,1994-03-26 +60213,"Unit 5440 Box 9982 +DPO AP 75601",4515349484810578,2019-04-30 +62971,"306 Howard Pine +Loribury, CA 03649",4182257706360805,2006-05-20 +6931,"960 Aguilar Roads +West Theresamouth, ID 18029",4143119901423,1973-08-08 +41080,"231 Butler Spur +Glennstad, MA 77962",2712342894287862,1987-05-09 +46346,"634 Irwin Circle +Bethanymouth, KY 44551",4543980095883046,2005-01-06 +53572,"995 Ashley River Apt. 462 +North Ellenmouth, VI 20118",502053896390,1983-01-08 +30968,"86718 David Greens Apt. 898 +West Margaretland, OH 18305",371247018731222,1980-12-22 +14037,"Unit 8961 Box 1274 +DPO AA 46337",4875506648447209,2016-03-04 +34271,"424 Robert Turnpike +Carlsonchester, NE 37851",4958911109989915,2014-11-25 +48954,"49646 Williams Parkways +Jamesfort, ID 80429",5141415896938049,1991-08-31 +32862,"5941 Jonathan Roads Apt. 916 +Martinezshire, OH 55948",3564976134444028,1998-03-14 +64030,"618 Edward Vista Suite 611 +North Carrie, NC 41253",4136263449785,1995-06-01 +29315,"19520 Kelsey Pines Apt. 333 +Sandraland, CA 48720",3513901865762994,1997-08-23 +60826,"62289 Randolph Island Apt. 655 +South Erica, NE 35540",3550137764052062,1980-10-21 +62429,"2392 Donna Points +North Amyside, WI 19666",4281668633482,2006-11-11 +25571,"5645 Alison Place Apt. 
306 +East Matthewport, WY 75560",3505857308879250,1970-05-11 +28760,"49150 Baker Court +North Anthonyfurt, VA 38140",371688036345497,1981-07-14 +19895,"1981 Werner Pines Suite 888 +New Julie, CO 40340",502056883320,2015-11-11 +61772,"8636 Gibson Valley +Lindaburgh, OK 62664",373777287334223,1983-09-13 +25838,"95227 Tate Hill +Williamstown, NC 34108",4519423277080461,2018-11-18 +51451,"1074 Perry Island +South Tracy, CA 99219",180031863795677,1979-04-11 +50314,"745 Destiny Turnpike +South Jamesshire, WY 84377",3520711950081684,1986-07-25 +9720,"284 Karen Street +Daltonton, OH 63821",4466877648760072,1979-05-15 +29809,"6416 Jackson Crossroad +Brucemouth, OR 30953",374514330248871,1971-07-31 +8579,"68997 Kevin Summit +Lake Veronica, TX 67939",4103619762598105,1980-01-02 +46708,"USNV Wilcox +FPO AP 00798",6589574296328362,1980-11-10 +15318,"842 Kathryn Land Apt. 314 +South Elizabeth, GA 02562",371736066410465,1978-04-16 +51503,"8999 Diaz Lake +Aprilburgh, KS 94559",4653123256069,2006-02-18 +13169,"91699 Stephen Rue Suite 760 +South Michaelhaven, RI 23725",2570839903428412,1983-08-13 +24901,"323 Jennifer Flats Apt. 803 +Kellyborough, AL 81485",3549496001866492,2010-10-13 +69158,"290 Hines Ways +Emilyview, MN 22956",4617106729558,2025-10-20 +21670,"1972 Cynthia Roads +New Erica, VA 44085",3528858753944725,1998-11-04 +24955,"990 Watts Road +Andersonside, HI 07113",4345248576782706564,1974-09-16 +18727,"94587 Stone Points Suite 237 +New Julia, AK 08630",6528300295195642,2023-12-10 +50830,"1795 Regina Lodge Suite 369 +North Julieshire, MI 32537",372006841377804,1985-03-28 +20731,"286 Paul Flat Suite 119 +Port Rodney, KS 27156",30198665108136,1998-03-26 +62737,"3141 Suzanne Branch +Emilyside, AS 18360",675950123041,1993-09-24 +68751,"8329 Ewing Fields Apt. 
801 +South Michaelmouth, MI 67119",4143882757324323,1979-01-19 +45401,"240 Melody Curve +East Crystal, AS 55935",4618314564765825,1992-03-21 +17149,"24401 Krystal Green Suite 231 +Michaelberg, NV 48157",3551751283722705,2001-03-25 +63901,"89298 James Brooks +Gonzalezport, ND 68891",3567158211486241,2002-06-03 +8714,"6102 Angela Street +Walshport, DC 10072",503885457872,2017-10-20 +54694,"7809 Shepherd Orchard +South Sarahbury, HI 57615",503878214520,1979-02-05 +48147,"265 Matthew Estate Suite 924 +Leeton, VI 17485",38898369598803,1993-12-12 +62805,"639 Smith Divide +East Jennifer, WA 18825",4741309587355,1971-03-26 +53735,"54914 Alexis Village +Pamelachester, FL 47230",4571100306550,2016-06-02 +33416,"5320 Frost Gardens Apt. 523 +Williamsmouth, MN 47514",5133087092047576,1972-10-21 +31666,"572 Gloria Ways Apt. 903 +West David, PW 72986",4166987320020,2014-01-29 +16216,"48361 Black Isle Suite 534 +Williamsberg, UT 53238",4251844208905793,1972-03-27 +33954,"0368 Ryan Lock +Bobtown, GA 38117",213118332129611,2008-12-05 +9142,"USNS Williams +FPO AP 02299",3530908657023526,2000-04-07 +57647,"001 Cowan Plains +Meaganland, ND 72441",4446716233155766,2009-08-20 +27122,"8849 Mayer Square +Medinaview, OK 72521",6553434172341553,2011-06-26 +9175,"3585 Wong View +Brianabury, WV 78311",4203114016864,2001-10-10 +7080,"1586 Denise Orchard Suite 035 +Ericmouth, WV 07891",3506667976015825,1999-12-17 +52396,"988 Jessica Pike +Randyside, LA 43790",3552979382421643,1991-03-22 +38064,"0160 Palmer Hill Suite 904 +Brownview, WY 57892",180029150441047,2018-12-07 +1634,"487 Aaron Cliffs Apt. 
101 +Mendozashire, NM 48323",4604715249652822,1971-09-30 +70669,"83830 Matthew Wall Suite 226 +Ericaton, GU 15335",378297820925069,1990-02-08 diff --git a/metasyn/demo/shop_multi/products.csv b/metasyn/demo/shop_multi/products.csv new file mode 100644 index 00000000..5197f3b4 --- /dev/null +++ b/metasyn/demo/shop_multi/products.csv @@ -0,0 +1,501 @@ +id,name,current_price,stock +1580,clearly,$609.13,3 +19264,community,$54.45,4 +4101,late,$338.16,0 +41872,remember,"$3,634.52",4 +61339,billion,"$4,233.47",2 +72646,couple,"$83,755.55",9 +70315,debate,"$7,761.58",3 +63005,attack,$990.68,5 +22224,statement,$713.68,0 +75449,billion,$23.20,4 +54980,issue,"$3,547.18",8 +55122,order,$47.12,0 +67833,against,$91.80,6 +12079,land,"$9,669.70",8 +67834,particular,$6.49,8 +29519,under,$444.47,5 +32620,hear,$2.82,3 +28483,staff,$534.91,6 +79995,which,$85.29,2 +74149,laugh,$58.65,0 +4638,might,$22.96,9 +5177,machine,"$59,020.80",3 +42931,yeah,$8.41,2 +22829,eat,"$1,335.90",0 +49656,third,"$8,867.24",4 +45531,material,$6.10,1 +10497,result,"$21,586.38",1 +48092,school,$5.89,0 +17095,government,$76.69,5 +8568,feeling,$51.02,7 +59109,lay,"$67,352.36",0 +77960,with,$821.97,9 +81928,our,$8.59,6 +4926,light,"$42,136.50",7 +59938,build,$2.69,6 +37512,student,"$7,589.00",2 +18236,research,$6.87,7 +70093,imagine,"$5,935.23",4 +74135,fight,"$6,672.70",6 +47760,single,$26.78,8 +77203,whether,$13.51,1 +48933,expert,"$1,495.81",1 +7438,be,$7.38,8 +49933,sea,"$34,483.78",6 +12746,goal,$53.43,1 +36380,check,"$20,173.48",9 +3253,against,"$9,772.80",1 +6331,senior,$6.19,2 +76348,money,$7.85,6 +79144,support,"$7,946.55",9 +66975,free,"$2,888.83",5 +54268,hotel,"$63,411.48",1 +69749,side,$3.87,4 +81026,full,$9.87,2 +3161,weight,"$8,779.87",5 +82958,us,$8.13,6 +42181,establish,"$8,206.28",1 +47286,there,"$89,934.48",8 +71356,vote,"$35,233.15",6 +50760,chance,"$21,492.78",8 +36260,store,"$1,325.17",4 +67147,she,"$83,099.86",6 +47011,save,$250.00,6 +18126,guess,$1.14,6 +48615,notice,$551.55,9 
+47738,try,"$3,231.35",7 +79968,product,$3.17,7 +30007,system,"$40,759.23",1 +22353,especially,$8.67,3 +26314,hour,"$35,793.92",2 +64458,station,"$54,764.29",1 +46530,hope,"$2,711.35",1 +76092,season,$564.45,1 +85059,everybody,$7.81,7 +14258,rich,$0.34,2 +71030,information,"$2,973.41",2 +53255,against,"$18,682.28",6 +25095,seek,$74.10,1 +17725,type,"$5,817.42",2 +35180,mission,$63.79,9 +18196,college,"$13,191.80",9 +50675,instead,$66.91,7 +59440,far,$308.05,1 +79413,subject,$9.82,2 +81580,health,$4.82,2 +12296,mother,$43.50,8 +53492,five,"$66,795.08",1 +63141,their,$939.49,3 +49648,can,$1.25,6 +21000,professor,$601.70,3 +83836,government,$9.05,9 +27643,page,$744.08,6 +63735,third,$13.69,1 +59172,produce,$38.60,1 +59868,certainly,"$7,656.12",7 +1965,item,$1.22,0 +18911,but,$2.29,1 +65943,another,"$6,125.34",0 +47518,yard,"$5,263.05",8 +43031,beautiful,"$3,368.12",6 +50479,notice,"$79,007.66",9 +37205,parent,$34.79,2 +37789,we,$917.38,7 +24476,however,"$99,484.44",0 +32281,fish,$6.56,0 +6169,wide,"$2,475.20",7 +25438,table,$573.25,1 +63895,shoulder,"$26,997.86",4 +66036,mean,"$2,108.65",8 +72748,listen,$0.88,5 +6889,difference,$3.93,5 +36878,moment,$203.13,3 +49625,since,"$87,981.44",3 +13569,ago,$3.78,5 +25665,reveal,"$80,279.69",4 +46348,no,$23.30,8 +55027,official,$791.36,1 +49213,point,"$90,878.62",3 +38774,must,$11.38,6 +14002,range,$47.20,8 +55221,put,$0.35,5 +14667,property,"$3,439.23",5 +80505,four,$377.73,3 +44610,food,$9.80,7 +10592,increase,$641.18,4 +72740,every,$59.28,1 +39400,soldier,$7.62,9 +66058,available,$5.60,8 +20024,official,"$6,448.19",8 +12852,note,"$98,582.41",7 +73663,mouth,"$2,741.77",0 +15563,value,"$62,593.36",1 +23203,research,$49.94,7 +27046,cultural,$269.19,7 +74692,card,"$30,132.40",1 +77296,forget,$7.54,9 +56692,many,"$8,454.40",3 +26685,thus,"$57,330.36",8 +14737,hotel,"$66,273.17",4 +26778,likely,$30.65,0 +26631,summer,$758.45,1 +26183,friend,"$5,912.23",3 +80081,shoulder,"$26,232.06",3 +6376,section,"$9,918.94",4 
+3020,then,$16.26,3 +4476,bring,"$2,449.98",7 +15336,hard,$314.34,6 +85172,agent,$61.76,7 +9964,reduce,$9.55,7 +27075,open,$1.87,0 +42447,want,$6.58,4 +52410,agree,$131.30,9 +44781,tax,$47.12,1 +959,seat,$241.48,2 +50713,might,$14.67,3 +72278,brother,"$6,483.72",0 +1633,top,$17.66,4 +51505,audience,"$15,227.76",0 +866,doctor,$0.02,1 +69167,ever,$5.56,4 +16363,already,$598.68,8 +2467,under,$708.33,5 +81872,we,"$85,789.46",2 +43625,south,"$3,188.57",0 +9872,toward,$7.37,6 +41015,ago,$9.83,6 +23256,own,$411.05,2 +25928,current,$994.78,1 +20417,serious,"$22,499.44",4 +47682,suggest,"$39,637.97",9 +14701,begin,$5.57,4 +1273,guess,$9.66,7 +55177,general,"$21,105.83",4 +72275,policy,$4.32,4 +18974,popular,$82.81,8 +3931,fine,"$67,428.02",5 +33397,eight,$99.07,7 +1382,treatment,$6.96,3 +38622,room,"$5,858.52",7 +29497,movement,"$7,869.75",6 +60350,its,"$10,628.47",7 +67979,officer,"$9,285.92",0 +64243,teach,$317.15,7 +57191,situation,$726.62,5 +26429,behind,$0.85,4 +43665,appear,$2.21,9 +65496,across,$409.48,7 +19825,avoid,$9.00,9 +76920,task,$962.52,9 +13982,manager,$7.19,9 +48958,season,"$97,571.57",4 +21243,perhaps,$131.09,8 +28519,yes,$17.45,6 +35612,again,$71.71,9 +24691,part,"$4,678.07",7 +30792,toward,$595.40,8 +22249,my,"$5,342.75",8 +25537,song,$247.67,7 +50544,ahead,"$37,012.98",0 +32207,take,$225.27,1 +59966,tell,"$2,872.89",5 +9482,seem,"$83,190.59",0 +79040,brother,"$6,739.35",5 +5226,seek,$93.90,0 +23894,dinner,$753.70,7 +57610,study,$620.30,8 +15905,ability,$5.73,9 +456,old,$1.14,9 +30663,also,$84.10,9 +41545,manage,$9.19,7 +47349,factor,$19.47,1 +17621,free,$818.40,8 +79637,wide,"$19,864.64",7 +23175,not,$0.01,1 +25842,start,"$70,770.35",5 +21887,production,$4.71,0 +33500,bag,$0.96,9 +37035,think,$150.56,8 +53099,road,"$75,762.10",2 +81131,message,$901.34,1 +42572,particular,$74.96,5 +54770,treatment,$4.27,2 +18733,thought,"$85,332.32",2 +3389,determine,$80.45,4 +42219,brother,$6.36,9 +46015,past,"$6,415.91",8 +7049,house,"$79,898.11",8 
+79141,pretty,$22.69,4 +46141,detail,$392.65,4 +9377,forget,$69.89,6 +15161,no,$240.83,2 +65335,source,$13.82,5 +3745,human,"$1,586.70",9 +19245,artist,$22.99,3 +16827,social,"$41,441.02",4 +29830,health,$405.14,4 +81264,sometimes,"$4,364.94",9 +69144,western,"$78,614.91",5 +62835,world,$65.52,3 +22232,phone,$846.30,3 +74792,interest,$642.93,6 +24479,become,$58.20,7 +12552,remember,$5.85,7 +29081,race,$3.65,3 +34907,artist,$805.90,6 +35266,into,$1.44,5 +5133,scientist,$39.80,1 +84351,over,$9.12,7 +760,quite,$46.65,9 +41938,treatment,"$57,299.84",9 +55365,network,$553.11,4 +32348,physical,$0.56,9 +30135,mind,$449.32,0 +64818,property,"$3,017.36",9 +79887,style,$61.49,7 +10385,future,"$48,683.84",9 +68447,rule,$31.18,7 +40764,whether,$437.49,9 +84684,require,$6.91,8 +15749,series,"$8,206.38",6 +3277,need,$2.61,6 +65850,market,$96.61,6 +10222,tend,"$7,650.79",9 +31255,story,$2.95,0 +54757,gun,$14.38,4 +3520,test,"$3,245.11",0 +70205,pull,$71.24,8 +29741,state,"$77,815.41",2 +21162,sort,$417.87,8 +40672,physical,"$12,158.52",6 +24239,ball,$992.67,1 +9479,key,"$44,303.85",9 +81095,far,$255.86,0 +14801,cause,"$71,040.58",1 +20127,word,$28.56,4 +56507,professional,"$3,167.81",7 +79674,up,$661.36,2 +38868,say,$799.59,9 +27178,answer,$417.21,1 +45392,any,"$30,311.97",7 +36753,too,$45.81,0 +10903,art,"$40,507.28",6 +20241,company,$0.30,4 +22573,face,$36.62,9 +62581,above,"$23,378.44",0 +84060,toward,$17.45,8 +16123,ok,$48.76,3 +53758,exist,$64.07,0 +21923,beautiful,$8.96,6 +41405,there,"$23,203.02",7 +71127,continue,$42.89,6 +84365,down,$37.78,4 +24932,kitchen,$0.31,6 +75395,government,$8.78,7 +8978,interesting,$596.79,5 +61863,day,$8.04,4 +82524,suddenly,$24.38,0 +37389,western,"$35,085.27",2 +60812,suddenly,$506.69,0 +65636,could,$113.21,1 +69651,own,"$30,267.65",4 +84908,see,$3.81,2 +23588,teacher,"$4,986.47",0 +11767,security,"$4,955.62",5 +15439,strategy,"$7,533.47",9 +77910,talk,$3.67,8 +12244,reach,"$47,423.64",8 +738,finally,$3.51,1 +28774,describe,"$80,334.54",5 
+51691,part,$99.13,5 +56103,green,"$42,599.22",7 +80709,learn,"$53,195.07",9 +62555,against,"$60,338.65",2 +62612,truth,$65.28,4 +77079,worker,$103.11,3 +4841,get,$2.68,5 +36861,night,$1.88,1 +60955,require,$407.33,5 +31341,wide,"$76,425.78",6 +19713,deal,$31.05,2 +79568,population,$737.11,0 +13117,throughout,"$4,421.79",7 +35611,loss,$664.83,7 +51447,building,$59.01,2 +37931,kid,"$5,651.43",2 +46134,media,"$73,402.01",6 +24772,eye,"$7,128.97",1 +49916,involve,"$3,761.77",8 +66061,development,$95.22,5 +60031,true,$1.31,7 +599,senior,$983.01,9 +75731,serious,$58.66,8 +18616,character,$2.89,1 +54138,sit,$52.50,1 +16776,phone,$691.81,1 +64694,yeah,"$5,379.40",3 +31360,market,$6.16,4 +50067,least,$72.03,7 +50592,painting,$85.30,1 +27971,better,$94.44,5 +75824,choose,"$49,800.98",3 +54339,book,$0.51,6 +15513,shoulder,"$8,757.31",2 +34294,statement,"$4,995.18",8 +59155,ahead,"$9,335.07",6 +26201,clear,$4.39,4 +38437,body,"$75,137.25",2 +76070,thank,"$5,791.66",2 +73995,special,$35.58,2 +46837,keep,$47.97,6 +13407,cost,$451.68,0 +83541,well,$4.42,1 +8918,relationship,"$2,080.51",5 +81667,become,$54.62,2 +52675,meeting,$362.55,9 +46334,owner,$3.63,0 +44409,receive,"$95,213.11",6 +59706,knowledge,$50.87,6 +33453,follow,$8.09,1 +44031,along,"$82,750.83",8 +71054,allow,$839.67,6 +61282,day,$31.66,2 +20250,wall,$157.00,9 +369,attention,$4.88,6 +2396,realize,$493.93,8 +45097,occur,"$7,734.52",7 +79163,people,"$24,741.18",8 +45383,four,$247.37,6 +72660,each,$6.78,0 +62900,discuss,"$26,126.80",0 +56341,treatment,"$61,295.91",0 +46753,or,"$2,324.30",8 +38506,call,"$1,538.36",8 +1973,of,$459.18,0 +7455,simply,$42.23,1 +76556,wonder,$77.24,6 +38060,blue,"$40,082.77",3 +70888,stand,$917.67,4 +11612,shoulder,$887.41,0 +10715,maintain,"$59,731.70",3 +43982,property,$6.12,4 +20618,purpose,"$15,031.47",3 +48672,make,"$82,325.80",0 +43722,break,"$95,906.21",8 +21579,show,"$72,472.04",0 +65693,start,$195.12,8 +35085,summer,$0.25,9 +6051,suddenly,"$8,879.19",9 +47038,interview,$940.84,5 
+40174,outside,$52.41,1 +52202,who,$460.50,3 +84827,every,"$52,363.64",4 +16927,fact,$122.32,2 +51952,foot,$72.38,0 +45816,next,"$9,549.82",9 +68701,return,"$7,522.64",3 +36811,peace,$811.90,0 +39982,Republican,$674.57,6 +48581,defense,"$35,354.65",6 +18876,action,"$84,598.50",6 +44285,sometimes,$624.44,7 +15329,clearly,$776.29,5 +81301,pressure,"$6,579.74",2 +9294,simple,"$60,373.89",8 +21396,society,$278.61,5 +79323,himself,"$79,477.72",9 +42732,brother,"$9,786.49",0 +78330,career,"$6,900.12",0 +25669,measure,"$5,427.55",5 +15698,dog,$34.30,8 +77828,security,"$67,026.70",9 +5086,material,$891.27,3 +15957,both,$611.50,7 +46774,view,$4.87,0 +55225,deal,$432.35,9 +68521,system,$6.77,7 +71590,arrive,$4.24,3 +65971,hold,$102.62,8 +53512,scene,$1.05,0 +42951,drop,$31.35,6 +31414,military,$629.67,1 +45884,body,"$93,034.81",6 +28820,example,"$96,404.24",6 +3724,create,"$7,324.34",4 +6513,foot,"$70,108.83",6 +47657,recently,$594.86,2 +22794,current,$89.42,0 +21829,bar,$74.14,9 +53247,shoulder,$64.23,4 +45226,law,$6.42,8 +52387,third,"$35,054.38",4 +7596,debate,$148.60,8 +10953,evidence,"$50,678.76",7 +85293,sometimes,$762.53,6 +20325,change,$886.12,9 +61695,vote,"$10,195.53",1 +9478,local,$667.19,4 +63718,several,"$25,402.26",9 +73509,meeting,"$4,733.97",5 +48928,cut,"$65,870.22",6 +60986,decide,$70.58,1 +71355,board,$87.61,3 +34683,available,$736.11,3 +12082,shoulder,$36.79,5 +61120,ability,$938.71,4 +47219,story,$2.64,8 +68273,of,"$5,342.16",0 +36893,let,$3.38,0 +63599,else,$56.34,5 +49366,receive,"$28,758.55",8 +10651,score,$85.09,4 +8663,Mrs,$369.13,3 +27276,whole,"$67,700.39",4 +65443,among,"$40,344.79",2 +77065,herself,"$94,992.14",5 +79629,president,"$5,709.10",4 +63696,budget,$904.16,2 +43386,town,$6.80,8 +70977,something,$383.45,9 +35981,edge,"$5,008.19",0 +64490,military,$51.03,1 +30783,everything,"$50,467.46",4 +23161,others,"$4,111.39",6 +71449,case,$1.57,6 +53543,project,$6.94,6 +2877,between,$64.63,2 +28311,sell,$779.58,5 +60416,half,$3.43,4 
+6028,drive,"$16,114.86",9 +62645,sound,$3.38,6 +31605,participant,$35.72,1 +38394,maybe,"$4,917.92",8 +75474,feeling,$481.18,7 +53877,sure,$5.94,4 +35872,available,$51.79,0 +4172,food,$3.11,0 +23445,body,"$1,651.79",1 +70746,begin,"$73,326.41",0 +6739,realize,$40.61,1 +77356,arm,"$8,713.98",2 +85278,quality,$8.60,0 +40663,generation,"$1,387.04",1 +5422,member,$743.07,8 +46000,among,"$40,264.15",7 +35430,address,$317.74,2 +65189,believe,"$5,809.06",6 +74986,yet,"$2,019.28",3 +59103,relationship,$4.12,3 +31788,tree,$10.99,1 +39405,evidence,"$96,828.69",1 +44713,pay,$9.32,0 +50680,prevent,$4.58,0 +80060,education,$1.51,4 +55490,factor,$9.71,7 +22199,hour,$4.76,4 +20217,meet,"$5,419.38",1 +78221,produce,$1.31,2 +63192,trade,$45.96,5 +43873,language,$3.21,9 +81942,seat,"$19,949.25",1 diff --git a/metasyn/demo/shop_multi/purchases.csv b/metasyn/demo/shop_multi/purchases.csv new file mode 100644 index 00000000..90e338b8 --- /dev/null +++ b/metasyn/demo/shop_multi/purchases.csv @@ -0,0 +1,1001 @@ +id,customer_id,price_paid,product_id +0,8579,"$78,287.51",37931 +1,54694,$901.94,30783 +2,66228,$48.93,80060 +3,53735,"$62,333.64",68273 +4,27958,$2.95,3520 +5,18662,"$2,099.40",10385 +6,1634,"$64,088.92",13407 +7,59360,$33.37,48928 +8,50169,$31.87,23175 +9,6422,$3.94,54138 +10,10383,$3.74,22232 +11,19895,$59.74,12296 +12,21156,"$97,830.22",53255 +13,46708,$5.48,38868 +14,8579,"$54,429.80",42951 +15,24901,$9.45,46015 +16,37659,"$80,193.24",47682 +17,57304,"$2,417.72",45097 +18,53572,"$8,852.67",48615 +19,29586,$131.50,35085 +20,38064,$22.85,67147 +21,42839,$80.88,81131 +22,24955,"$29,402.81",29497 +23,68862,$30.12,22573 +24,57647,$1.23,27075 +25,56855,$41.58,63696 +26,19211,$85.09,77910 +27,69118,$4.37,21000 +28,49833,$2.16,59938 +29,3109,"$73,931.41",26778 +30,29315,"$89,059.47",20241 +31,44382,$82.72,2396 +32,21156,"$29,202.14",24772 +33,61274,$81.71,43982 +34,26804,"$53,714.10",67147 +35,70669,$1.49,7049 +36,16216,$6.80,25438 +37,42057,$306.17,77356 
+38,35147,"$7,860.07",37205 +39,20062,$99.86,47011 +40,13535,"$5,026.02",75449 +41,11568,"$8,249.50",35872 +42,37659,"$6,730.43",48958 +43,62805,"$1,702.08",79674 +44,16097,$26.56,369 +45,19884,"$52,523.32",79674 +46,14278,$8.54,5422 +47,43669,"$6,889.31",37035 +48,46346,$56.36,79323 +49,25571,$76.99,3020 +50,52904,$59.34,66061 +51,71286,"$2,388.22",10222 +52,42779,"$1,098.60",20217 +53,13169,$8.80,77065 +54,8714,$381.22,39405 +55,52098,$9.18,43722 +56,53572,$976.36,22232 +57,70515,"$75,704.59",35430 +58,25571,"$9,168.44",52410 +59,62926,$448.82,46348 +60,38064,$716.01,47738 +61,538,$360.36,77065 +62,61591,$447.89,59440 +63,538,$987.95,53758 +64,44905,"$6,925.73",20241 +65,68765,"$5,812.90",22794 +66,52930,$93.83,22249 +67,63901,"$1,682.51",48928 +68,29809,"$8,271.58",18911 +69,1869,$62.49,32620 +70,56855,$8.70,81131 +71,46708,"$49,541.32",54757 +72,42057,$65.96,80709 +73,9142,$2.01,50067 +74,61274,$744.61,369 +75,39630,"$12,100.32",8918 +76,63322,"$62,404.58",85172 +77,51257,"$6,995.79",71590 +78,28257,$882.08,18974 +79,8714,$976.67,44610 +80,6422,"$5,383.42",4476 +81,18727,$8.56,18911 +82,48110,$6.08,5133 +83,50847,$61.97,75395 +84,29023,"$8,589.55",38622 +85,69158,"$1,485.61",22829 +86,41864,"$71,395.32",84908 +87,8442,$2.69,60416 +88,53301,$607.68,77296 +89,28418,"$76,721.45",26429 +90,63322,"$1,855.34",16363 +91,51471,$1.05,39400 +92,45401,"$34,979.21",41405 +93,23038,"$62,332.97",15329 +94,28257,$48.37,85293 +95,33954,"$2,480.48",24479 +96,56855,"$46,705.53",30783 +97,44905,"$8,978.80",13117 +98,61274,"$9,452.24",53758 +99,33416,$8.93,14701 +100,53301,"$9,593.11",18126 +101,31770,$23.26,22232 +102,41730,$5.75,62835 +103,1634,$863.09,77296 +104,12154,$2.27,23445 +105,69118,"$72,900.21",3931 +106,54694,"$18,264.16",53877 +107,51451,$987.33,18876 +108,48937,"$52,280.68",14667 +109,71286,"$8,645.69",60031 +110,48937,$561.57,43722 +111,5936,$0.61,62835 +112,62971,$7.49,85293 +113,35106,$308.52,35611 +114,67458,$6.10,46774 +115,57543,$2.58,35612 
+116,35106,$31.34,60416 +117,32862,"$44,731.72",35085 +118,51427,$454.36,72275 +119,42839,"$70,018.75",14737 +120,34983,$601.49,22794 +121,11558,$64.56,56341 +122,63322,$18.83,25669 +123,21453,$37.19,47219 +124,56876,$7.96,42931 +125,68862,$649.95,60812 +126,64448,$59.34,55225 +127,34271,"$14,918.03",15513 +128,70669,$33.35,17725 +129,19884,$43.32,56692 +130,48147,$212.26,66061 +131,62926,$5.83,16363 +132,29023,$6.21,29081 +133,52930,$66.92,49213 +134,31666,$968.73,74692 +135,18662,"$49,799.74",9478 +136,27122,"$4,532.98",19713 +137,9175,$8.35,55490 +138,42839,$8.84,29830 +139,24366,$487.31,84351 +140,27122,"$36,123.68",59868 +141,37428,$0.37,16776 +142,41730,"$16,527.04",72275 +143,38111,$753.74,49916 +144,64431,$8.42,54138 +145,60826,"$56,829.17",29830 +146,8442,"$2,795.81",27046 +147,69158,"$7,755.08",4841 +148,57809,"$96,541.72",60812 +149,27122,"$92,168.71",50760 +150,19895,$729.20,53099 +151,35147,"$2,398.97",11767 +152,41730,$22.64,55365 +153,538,"$9,018.67",31341 +154,68862,$113.03,31605 +155,42839,$8.74,10953 +156,23038,$633.94,69144 +157,52396,"$69,875.59",62581 +158,22363,$8.85,18911 +159,37428,"$64,727.57",36878 +160,48937,"$1,861.52",1580 +161,52930,$369.12,67147 +162,28418,"$78,856.33",2396 +163,29313,$17.66,5226 +164,6931,"$33,721.51",36811 +165,6422,$6.74,65636 +166,538,$882.85,65850 +167,26804,$5.83,62835 +168,48937,$449.13,14667 +169,23486,$7.20,65335 +170,9720,$9.00,20250 +171,28375,$318.37,55221 +172,19895,$11.94,64818 +173,67873,$50.39,24932 +174,13182,"$38,059.25",84827 +175,6422,"$10,780.10",72660 +176,14037,$595.88,63192 +177,53735,$149.99,32207 +178,67458,"$4,683.02",54268 +179,11568,$86.17,8568 +180,54694,"$3,291.83",21829 +181,50554,$98.30,33453 +182,64448,"$5,646.12",70977 +183,9720,"$1,033.86",68701 +184,17518,$3.08,70205 +185,35106,$85.61,18126 +186,54694,$0.67,60031 +187,24955,$12.10,48615 +188,7080,$83.26,70888 +189,67873,$85.53,46530 +190,28699,"$7,627.39",47011 +191,60985,"$6,445.55",3745 +192,35996,$359.84,23161 
+193,37428,$94.82,53877 +194,14278,$1.93,74986 +195,57256,$342.94,47038 +196,50314,$6.48,41545 +197,69158,"$95,262.30",29497 +198,41864,$75.77,81928 +199,11568,$483.50,15336 +200,24955,"$37,002.09",84827 +201,73006,"$79,490.99",9479 +202,61772,$5.17,28774 +203,538,"$6,942.59",40663 +204,216,$74.32,55225 +205,62926,"$7,201.51",44713 +206,49833,$488.52,48581 +207,33954,$1.52,42219 +208,13065,"$27,161.34",959 +209,49018,$75.49,48092 +210,39630,$73.42,21396 +211,5936,$74.36,38868 +212,71286,$80.44,31605 +213,49018,"$5,451.15",60986 +214,18662,$861.29,55490 +215,16853,$8.77,63599 +216,4220,$778.59,79040 +217,63994,$8.31,64694 +218,54694,"$23,071.37",75824 +219,43669,"$1,803.14",46000 +220,24955,$5.30,77910 +221,39707,$943.66,28774 +222,37428,$2.11,3389 +223,22363,"$4,671.81",60812 +224,22057,$28.14,17621 +225,60985,"$68,233.43",46134 +226,41864,$8.50,64694 +227,48937,$26.51,61339 +228,21453,$5.85,50713 +229,24901,$68.11,21162 +230,12154,$221.29,4638 +231,54591,$70.13,46000 +232,6422,$309.86,40672 +233,31770,$444.61,79887 +234,33416,"$10,422.11",10903 +235,17149,$881.47,49648 +236,46708,$24.72,22829 +237,50314,"$82,672.55",55221 +238,31770,"$5,084.75",12746 +239,68751,$187.77,11767 +240,63994,"$75,161.61",75449 +241,69118,$41.43,76556 +242,23486,$96.51,16927 +243,53735,$86.07,70977 +244,14278,"$25,279.73",66061 +245,50847,$8.91,63599 +246,70515,"$74,725.62",53255 +247,49833,"$3,562.17",56692 +248,27521,$498.33,64694 +249,23486,"$90,754.06",35085 +250,20731,"$55,144.22",24691 +251,51503,$749.29,81872 +252,25571,$77.63,60416 +253,37545,"$4,176.44",81301 +254,44382,$663.08,3520 +255,52098,"$19,480.48",70315 +256,51279,"$12,546.15",18196 +257,28418,$1.76,55221 +258,49041,$90.49,28820 +259,43669,$0.90,20250 +260,28760,"$9,073.86",46837 +261,49018,$53.95,47219 +262,14037,$901.40,5226 +263,64030,$39.37,28774 +264,38111,$9.15,6028 +265,32125,$1.07,79144 +266,44382,$308.03,29830 +267,538,$1.55,71590 +268,58050,"$88,254.71",53099 +269,20731,$78.55,63192 +270,5936,$4.40,14002 
+271,50169,"$7,872.02",64818 +272,34983,"$62,550.58",81026 +273,51279,$59.68,47011 +274,50169,"$48,920.51",79323 +275,64030,"$63,535.11",22224 +276,64448,$84.49,52202 +277,56855,"$28,055.28",20250 +278,5936,"$7,760.49",10497 +279,8714,$16.99,38506 +280,57647,$7.57,4172 +281,70682,"$88,792.24",65943 +282,46346,$1.04,65850 +283,56855,$22.13,26314 +284,28418,$223.89,44610 +285,24901,$716.18,52675 +286,60213,$72.40,1382 +287,68862,$897.21,6739 +288,42679,$611.11,14002 +289,17518,"$43,518.62",81580 +290,59360,$5.42,20618 +291,18662,$415.93,41938 +292,9720,"$58,584.38",38060 +293,25571,"$8,603.94",60955 +294,33520,"$87,979.90",30783 +295,9142,$660.06,35981 +296,67873,$36.44,15513 +297,42779,$5.86,35085 +298,51503,$325.29,67979 +299,62971,"$5,814.11",11767 +300,41730,"$7,763.88",22232 +301,60357,$8.68,46000 +302,18727,$514.64,25095 +303,60985,$1.94,31605 +304,48937,$1.09,6513 +305,9827,"$9,807.43",56507 +306,42679,$7.65,25537 +307,7080,"$4,699.37",62900 +308,45401,$48.84,48958 +309,63901,"$3,927.22",79141 +310,18727,"$78,886.34",15698 +311,69118,$21.67,47760 +312,60762,$8.98,43722 +313,5936,$242.97,13407 +314,34983,$25.45,47219 +315,60357,$407.33,10222 +316,28699,"$6,542.61",82958 +317,6422,$59.01,15161 +318,6422,"$5,505.05",7455 +319,59386,"$25,814.32",34907 +320,52098,$301.50,40663 +321,29023,"$87,804.77",24772 +322,62926,"$35,851.96",47219 +323,62805,"$66,827.00",35180 +324,56991,"$2,373.38",67833 +325,42679,$90.51,9482 +326,48110,$157.43,68701 +327,8579,$290.85,7049 +328,32125,$763.38,46134 +329,60357,$737.70,53099 +330,67458,$58.19,71449 +331,48110,$241.09,63735 +332,37659,$2.65,85278 +333,1634,"$4,364.70",39400 +334,61591,"$1,737.79",25438 +335,44905,"$28,471.43",81131 +336,51427,$81.74,31255 +337,62805,$20.54,19825 +338,9827,"$54,072.82",3724 +339,73006,"$6,138.85",9479 +340,42679,"$75,623.50",55122 +341,17998,$5.12,79995 +342,9720,$535.51,54268 +343,9827,$93.32,63192 +344,60985,"$62,515.62",35981 +345,37428,"$99,810.96",47011 +346,71286,"$64,815.74",3277 
+347,42337,$292.73,67834 +348,64030,$955.75,54770 +349,51991,$9.54,63718 +350,24955,$22.24,44610 +351,68862,"$36,770.70",51505 +352,24366,$7.04,67979 +353,68862,"$1,205.31",53543 +354,63994,"$5,685.39",8568 +355,42839,"$59,647.15",12079 +356,37545,$81.56,20325 +357,3109,$75.79,36811 +358,70515,$3.01,31255 +359,15318,"$64,638.40",15513 +360,52904,$514.08,47011 +361,41080,"$14,888.02",28774 +362,60357,"$8,408.53",65943 +363,11558,"$77,444.84",36260 +364,70682,"$2,575.83",55365 +365,49041,$4.48,16776 +366,61274,$91.39,38868 +367,61274,"$5,031.11",55122 +368,13065,$625.38,62900 +369,57256,"$5,085.10",10592 +370,62926,"$68,027.18",52202 +371,32862,$4.33,44409 +372,51471,$162.50,59155 +373,70682,"$77,707.95",43386 +374,16853,$8.47,65335 +375,59360,$745.26,8568 +376,37644,$283.75,38506 +377,538,"$5,544.66",25928 +378,19211,$276.61,369 +379,9720,"$7,902.18",65189 +380,17998,$243.83,17095 +381,70515,$777.52,56341 +382,45401,"$8,074.80",18196 +383,51471,$859.85,46348 +384,13065,$15.39,42572 +385,8579,$780.42,24691 +386,64448,$498.32,33500 +387,15318,"$44,417.25",25665 +388,53299,$74.04,760 +389,14037,$2.33,21162 +390,52396,"$3,748.18",53492 +391,27958,"$5,785.24",66975 +392,46754,"$26,480.13",59155 +393,64448,$488.01,28820 +394,13182,$80.53,13982 +395,19884,"$5,303.21",456 +396,71286,$96.65,53255 +397,13169,$75.82,72660 +398,38111,$16.33,71590 +399,48147,$70.32,23161 +400,35147,$25.59,34907 +401,24366,"$4,528.49",67147 +402,31666,"$21,583.47",45884 +403,58050,$499.21,27178 +404,13169,$9.09,67979 +405,38111,$320.96,39982 +406,62926,$126.26,69167 +407,17998,"$5,002.95",46334 +408,24901,"$7,978.26",83541 +409,17149,"$49,129.42",9964 +410,44382,$863.43,9294 +411,52098,"$8,902.95",65335 +412,24955,$580.04,67834 +413,11568,$5.73,47518 +414,29023,$4.16,53247 +415,30968,$436.51,35981 +416,45401,$65.48,36753 +417,17518,"$98,863.94",43665 +418,11558,$9.11,77296 +419,51471,$924.64,73509 +420,36001,$81.58,45383 +421,21330,"$2,177.19",75395 +422,57543,$14.30,74792 +423,49833,$12.96,5086 
+424,37428,$34.62,19713 +425,62737,"$81,084.16",26685 +426,53299,"$18,783.63",53543 +427,57304,$10.45,17621 +428,26804,"$20,784.38",29830 +429,13169,"$77,419.76",31341 +430,28418,"$94,222.62",1382 +431,29586,"$53,714.95",18876 +432,19895,"$4,471.26",49656 +433,11568,"$19,371.90",79995 +434,50554,$66.07,10592 +435,49018,$236.49,4172 +436,29313,$363.70,21162 +437,25571,"$4,062.79",55490 +438,42779,$69.76,28820 +439,17518,"$82,155.43",40663 +440,61591,"$84,317.66",6513 +441,54591,"$52,829.65",6376 +442,48110,$3.09,27075 +443,50169,"$39,932.59",41938 +444,12034,$6.69,59440 +445,14278,"$5,861.71",80709 +446,48954,$8.56,27643 +447,24366,$231.39,1973 +448,20731,"$7,413.78",36753 +449,5936,"$88,705.36",45884 +450,42057,"$47,916.54",38506 +451,70515,"$2,386.67",74792 +452,48147,$44.80,31788 +453,57256,$82.63,84827 +454,70515,$80.24,25537 +455,53301,$14.61,67979 +456,42057,$43.69,9478 +457,57304,$4.84,6889 +458,64448,$4.73,959 +459,57256,"$10,297.85",72748 +460,48954,$5.29,14737 +461,9175,"$44,988.84",35180 +462,35106,$8.22,20618 +463,21453,$12.16,30135 +464,52098,"$49,409.74",79163 +465,55472,"$43,401.08",14667 +466,41080,$14.78,21000 +467,42057,$13.57,84827 +468,50314,"$5,077.65",24932 +469,42839,"$30,841.84",24691 +470,24366,$214.61,74692 +471,29586,$8.30,52410 +472,53299,"$6,623.68",6028 +473,538,$704.26,6513 +474,49041,$4.57,13569 +475,49876,$37.33,51447 +476,19895,"$7,997.78",13407 +477,13065,$436.61,82524 +478,36001,$4.47,79887 +479,29313,"$23,483.65",15161 +480,35147,$9.51,15957 +481,68862,$313.67,13569 +482,18727,$843.08,25842 +483,7080,"$2,891.33",68701 +484,56991,$63.89,369 +485,35106,"$48,336.43",41015 +486,9738,$98.53,53099 +487,9175,$427.75,30007 +488,39630,$94.26,18974 +489,13066,$3.17,66036 +490,28418,"$1,998.46",10953 +491,52396,$5.05,79887 +492,14037,$69.71,71355 +493,57647,"$99,516.70",20618 +494,9142,"$7,604.96",25438 +495,33954,"$6,838.68",54339 +496,39707,"$95,277.90",72740 +497,43669,"$8,396.96",36753 +498,13066,"$38,613.62",72275 +499,21330,$7.63,9377 
+500,20731,"$49,707.95",738 +501,44905,$580.96,63696 +502,44905,$8.78,80060 +503,37545,"$9,465.36",81095 +504,51503,$42.00,31788 +505,45401,$799.71,53255 +506,38064,$6.46,40672 +507,8579,$18.09,53247 +508,24366,"$96,676.04",81264 +509,62971,"$2,119.63",46015 +510,9175,$160.87,62645 +511,1869,"$19,813.63",57191 +512,38064,"$17,625.32",65636 +513,52232,$837.94,49656 +514,20062,$9.57,74135 +515,20062,$931.71,12552 +516,15318,"$4,896.54",47518 +517,11568,$881.56,25095 +518,21670,"$7,093.31",74692 +519,48954,$400.47,54138 +520,58775,"$6,156.09",70205 +521,57543,$0.74,4172 +522,60213,$19.06,14667 +523,66228,$210.72,2467 +524,51451,"$78,005.18",37789 +525,38111,$7.19,65189 +526,35147,"$95,670.34",1273 +527,7065,$288.44,10953 +528,9142,"$1,005.94",35611 +529,13066,"$4,491.47",22353 +530,9738,$99.46,47738 +531,64431,$132.51,5177 +532,63901,"$34,582.26",21923 +533,49018,$4.92,81131 +534,36001,$10.33,12079 +535,41730,"$3,381.05",20325 +536,51471,"$12,151.81",30783 +537,68862,$1.64,45097 +538,63901,"$35,657.50",29497 +539,21453,$18.60,14258 +540,49876,$2.03,65971 +541,14278,"$64,592.50",9872 +542,31770,$471.39,47682 +543,17518,"$53,446.26",42572 +544,24366,$20.91,47349 +545,28163,$593.96,14667 +546,24955,"$1,230.66",47760 +547,43669,"$24,274.80",7438 +548,23038,"$5,260.16",25537 +549,10383,$9.98,63192 +550,57544,"$7,627.58",84365 +551,62429,$90.93,37035 +552,5936,$1.95,49648 +553,48937,$0.41,79995 +554,54694,$31.33,18236 +555,33520,$0.29,27971 +556,55472,$13.97,45383 +557,53301,"$4,934.34",81928 +558,44382,$83.74,53512 +559,46708,$152.91,12082 +560,59360,$6.03,3277 +561,69118,$482.96,59440 +562,8442,"$50,799.58",37205 +563,13169,$310.48,50067 +564,49041,$967.99,4926 +565,52904,$778.35,599 +566,8198,"$8,732.55",69651 +567,38064,$1.87,79144 +568,35996,"$3,842.48",40672 +569,8579,$93.18,63005 +570,33520,$6.71,63895 +571,42057,$8.63,16123 +572,61772,"$2,341.81",66036 +573,57647,"$43,980.18",49366 +574,16853,$439.89,36380 +575,42839,$5.87,10497 +576,6931,"$59,480.13",5422 
+577,9142,"$55,926.05",45816 +578,29023,$961.72,47682 +579,51991,"$12,679.50",53247 +580,42679,$9.18,29830 +581,13066,"$58,045.34",15957 +582,45401,$282.07,70977 +583,57809,"$1,378.27",41938 +584,50554,"$17,735.24",21579 +585,41730,$961.50,70093 +586,46346,$409.26,72740 +587,6931,$468.97,63895 +588,62805,$654.42,55490 +589,61772,$78.74,61695 +590,59386,"$35,891.32",83541 +591,49833,"$53,366.53",27046 +592,21670,$1.63,9294 +593,57809,"$87,012.87",7049 +594,52098,"$27,789.83",54138 +595,56991,"$3,891.75",70315 +596,9142,$901.02,77828 +597,4186,$24.91,71127 +598,29809,"$60,489.85",50680 +599,38111,$9.71,79629 +600,49876,"$29,819.19",15329 +601,41864,"$9,505.18",66036 +602,42679,$3.84,85059 +603,17518,$14.64,19264 +604,57647,$397.79,60350 +605,1634,"$31,404.27",12082 +606,59360,"$7,533.93",55177 +607,28257,$2.63,84060 +608,63322,$217.01,61282 +609,13535,$28.42,61120 +610,11568,"$1,077.12",77079 +611,68751,"$2,532.70",65443 +612,58050,$47.88,79141 +613,67458,"$39,582.70",40764 +614,216,$258.07,27075 +615,44382,$7.25,62581 +616,31770,"$16,503.16",81301 +617,68862,"$4,727.26",1273 +618,57543,"$3,310.31",6169 +619,59360,$0.45,2467 +620,8442,$97.65,71590 +621,42057,"$14,984.21",23445 +622,51427,$5.85,1633 +623,11568,$98.06,77828 +624,216,"$21,463.39",27075 +625,36001,$3.91,40174 +626,48110,$49.25,599 +627,45941,$19.55,24476 +628,41080,"$5,052.19",36753 +629,7065,"$73,101.29",31255 +630,56855,$154.19,79323 +631,28163,"$29,590.27",43386 +632,51991,$11.85,20127 +633,52904,"$9,273.69",24239 +634,35996,"$11,746.45",1382 +635,56855,"$6,188.01",61863 +636,28257,"$4,036.91",65850 +637,58050,$21.21,8568 +638,31278,"$6,054.22",3020 +639,27521,$92.99,45816 +640,70669,$54.63,72660 +641,33416,"$90,489.13",3277 +642,27958,$29.84,40672 +643,31770,$44.88,2467 +644,51991,$281.68,43722 +645,44382,$8.09,18911 +646,41080,$47.32,15329 +647,61772,"$51,926.44",19713 +648,53735,"$1,392.98",31341 +649,8579,$125.66,9478 +650,41730,"$62,563.33",84365 +651,6422,$42.59,22829 +652,18727,"$48,870.76",959 
+653,29313,$1.93,46134 +654,21330,$51.05,14701 +655,73006,"$4,641.88",599 +656,64448,$5.96,63718 +657,216,$687.06,55122 +658,68751,"$9,676.64",37789 +659,13066,"$8,785.73",71449 +660,55472,"$9,672.06",84908 +661,29809,"$7,563.60",66061 +662,66228,"$44,876.62",43031 +663,25838,$530.79,77910 +664,51471,"$9,044.58",56103 +665,22363,$5.70,74692 +666,35147,"$42,238.37",61339 +667,23486,$10.27,30663 +668,9827,"$3,866.40",14737 +669,42337,$5.08,8663 +670,64431,"$97,727.02",20024 +671,63994,$534.93,76092 +672,58775,$430.67,44781 +673,21156,$115.05,77356 +674,12154,"$38,049.48",12244 +675,4220,$23.00,42219 +676,39707,$731.22,21579 +677,9142,$737.08,369 +678,21453,"$9,453.29",28774 +679,43725,"$34,894.68",25537 +680,13182,"$3,765.35",21923 +681,13066,$3.40,66058 +682,9175,$50.49,51691 +683,52396,"$43,276.82",25842 +684,32862,$618.98,6331 +685,35147,$1.04,2877 +686,8714,$4.51,48928 +687,12154,$3.41,67979 +688,39707,$987.21,41015 +689,32862,"$4,537.21",80709 +690,71286,$0.58,32207 +691,57809,$712.32,5422 +692,44905,$2.50,16827 +693,36001,$719.06,56341 +694,50169,"$2,031.47",14701 +695,43725,$437.83,40672 +696,25571,$11.16,79968 +697,67458,$2.23,18911 +698,37644,"$27,988.12",60350 +699,28163,$9.45,75824 +700,57809,$43.84,25928 +701,51257,$70.30,26183 +702,28760,$223.38,8978 +703,26804,$4.23,15749 +704,50554,$3.48,74135 +705,62805,"$4,302.14",36811 +706,21156,$30.48,37035 +707,13169,"$51,841.28",84908 +708,49041,$3.15,63005 +709,5936,$346.90,79887 +710,1869,$90.45,46000 +711,46754,$10.25,6051 +712,12034,$68.07,79887 +713,14037,"$7,145.41",3745 +714,11558,$990.25,63895 +715,37659,$848.28,1973 +716,17149,"$4,935.94",64490 +717,49876,$166.44,38774 +718,60985,$978.70,30007 +719,70682,$2.02,35266 +720,59360,$948.22,77203 +721,9827,"$28,581.13",35981 +722,55472,$577.25,53492 +723,61274,$7.43,959 +724,11558,$261.35,30783 +725,35996,"$43,821.04",68521 +726,43725,$62.55,63735 +727,71286,$129.48,22232 +728,37644,"$3,936.80",76920 +729,69118,$790.84,65189 +730,61591,"$9,441.21",49656 
+731,44382,"$46,556.80",81942 +732,37659,$603.56,17095 +733,9175,$16.28,40663 +734,49876,"$9,902.49",72646 +735,6931,"$8,738.17",62900 +736,29586,"$92,608.50",8663 +737,60762,$497.58,4841 +738,17518,$108.04,73995 +739,28699,"$5,996.45",19264 +740,27958,"$5,789.15",48092 +741,60736,"$86,167.62",48928 +742,9142,$1.81,28774 +743,39707,"$37,905.45",16363 +744,31278,$7.52,55225 +745,53735,$77.02,60031 +746,13535,$98.44,47657 +747,29315,$937.88,19264 +748,18662,"$5,659.72",47682 +749,43725,$938.13,15563 +750,57647,"$37,989.28",76556 +751,27122,$62.65,6051 +752,52904,"$9,455.58",65693 +753,9775,$729.76,67979 +754,14037,$28.79,3931 +755,54694,$7.30,15957 +756,17998,$9.57,1633 +757,24955,"$6,044.11",59109 +758,53301,$814.98,45884 +759,6931,$5.01,74692 +760,31666,"$95,081.37",45097 +761,17518,$9.40,21829 +762,60826,$7.74,28483 +763,61591,"$94,654.82",15336 +764,70669,$508.14,36380 +765,31770,$4.87,74986 +766,53572,"$18,746.92",76070 +767,6422,"$7,513.32",28820 +768,62926,$983.87,43873 +769,51279,$746.46,76348 +770,62737,$7.19,10651 +771,57543,$455.88,54138 +772,57256,$3.91,25665 +773,13169,$358.05,85172 +774,29315,"$99,540.63",71449 +775,27521,$1.77,6376 +776,52396,"$89,662.56",25438 +777,51991,"$4,478.38",42447 +778,34271,$41.27,75824 +779,53572,"$8,866.33",70205 +780,44905,$2.93,75449 +781,52904,$61.09,37512 +782,53572,$98.77,9479 +783,73006,$4.25,79995 +784,10383,$5.97,43873 +785,8579,"$95,518.36",59938 +786,21670,$84.79,60416 +787,31278,$4.85,76070 +788,9738,$518.32,72660 +789,31770,$83.40,38060 +790,13065,$656.98,70888 +791,24955,"$1,723.27",46134 +792,21453,$8.98,55122 +793,52396,$8.13,71355 +794,66228,$723.03,71590 +795,13169,$16.42,18733 +796,21453,"$80,794.01",33453 +797,53301,$8.65,10651 +798,19211,$0.53,43982 +799,58775,"$6,688.92",25665 +800,24366,$0.42,28311 +801,7080,"$1,585.00",37931 +802,56876,"$1,977.63",26429 +803,70669,$81.41,21396 +804,70515,$6.16,18974 +805,50314,$4.60,48933 +806,53299,$10.07,74986 +807,52232,$0.23,81131 +808,31770,$57.72,27276 
+809,34983,"$6,647.22",51691 +810,63901,$5.99,31255 +811,57304,$98.30,37512 +812,69118,"$55,644.54",75395 +813,4220,$1.41,47682 +814,63322,"$6,330.74",50675 +815,68751,$26.32,23161 +816,4220,$13.20,64458 +817,21330,"$83,432.08",5226 +818,44382,"$70,036.75",65335 +819,46754,$86.16,14002 +820,42679,$98.61,61282 +821,68862,$10.02,31414 +822,32125,"$9,487.46",17621 +823,27958,$24.11,30135 +824,13169,"$22,517.59",36893 +825,38111,"$51,918.25",55490 +826,29023,$660.20,51952 +827,45941,$424.41,70205 +828,32862,$615.62,61282 +829,12154,$7.31,3277 +830,29586,$58.24,36753 +831,48110,$317.57,62835 +832,60826,"$3,394.50",70977 +833,31770,"$13,633.24",56341 +834,68765,$51.75,23203 +835,25838,$2.34,14737 +836,67787,$80.73,37931 +837,9142,$17.68,77960 +838,31666,"$85,246.25",76070 +839,41730,"$96,676.08",3277 +840,64431,$143.45,39400 +841,24955,"$9,203.26",55122 +842,27521,$61.67,19713 +843,44382,$6.65,21829 +844,4220,$685.10,75731 +845,20731,"$53,391.02",959 +846,14846,$30.03,56103 +847,72970,"$98,254.74",15329 +848,13535,$3.40,14737 +849,42779,"$48,617.35",77960 +850,15318,$73.19,59109 +851,67873,$334.62,53099 +852,52098,$85.85,65496 +853,50847,$70.98,6331 +854,32862,"$58,137.84",60350 +855,33954,"$6,733.05",35612 +856,50847,"$6,505.70",16927 +857,28163,$0.42,3724 +858,46754,"$20,876.52",70205 +859,33954,$809.30,3389 +860,34271,$7.80,30663 +861,50830,"$58,682.45",49648 +862,64448,"$7,253.49",8978 +863,19884,$7.58,46530 +864,3109,"$2,143.90",27075 +865,51991,$72.76,26201 +866,16853,$46.45,52202 +867,42057,$5.93,40672 +868,19211,"$72,529.12",24476 +869,62926,"$70,295.89",62555 +870,31666,$9.46,26631 +871,52930,$6.07,42951 +872,43669,"$3,645.64",47011 +873,19211,$232.24,59966 +874,56876,$55.50,60350 +875,38064,"$19,883.49",20250 +876,538,"$65,209.27",42447 +877,4186,"$8,407.62",76070 +878,9720,"$4,699.33",65335 +879,37659,"$9,966.17",62645 +880,6931,"$56,817.29",43722 +881,17518,"$7,055.92",37205 +882,12034,$259.23,26183 +883,27958,$7.79,16827 +884,61274,"$94,425.98",41938 
+885,9142,$643.18,22353 +886,10383,$163.84,50675 +887,50847,"$2,987.00",76092 +888,50554,"$99,109.59",72740 +889,9720,$402.08,22573 +890,61772,"$27,013.35",28519 +891,61772,$4.55,39400 +892,20062,"$2,137.43",27276 +893,57544,$30.36,75474 +894,71286,"$4,166.61",61863 +895,1869,"$8,866.35",47518 +896,21670,"$4,952.81",73663 +897,20062,$322.51,14801 +898,38064,"$74,825.56",35085 +899,538,"$7,090.13",65189 +900,38111,"$8,407.40",57191 +901,45401,$47.50,39400 +902,14846,"$68,864.12",9872 +903,28163,$94.22,23161 +904,33520,$7.47,62900 +905,35996,"$2,596.88",40174 +906,52232,"$66,404.57",12082 +907,13182,$692.13,76092 +908,49876,$3.82,66036 +909,1634,$15.52,72660 +910,48937,$22.70,26201 +911,7080,$21.12,8568 +912,29023,$57.99,1580 +913,14846,$3.20,47286 +914,17998,$9.66,10651 +915,4186,"$72,250.16",13117 +916,50830,"$1,037.58",50479 +917,5936,$60.36,79674 +918,54591,"$89,760.03",50592 +919,31278,$78.55,61863 +920,20731,"$82,590.43",60416 +921,3109,$466.80,1973 +922,49876,"$8,770.28",74149 +923,62971,$102.46,7049 +924,8442,$0.48,13407 +925,41864,$286.27,25438 +926,18662,$109.61,81026 +927,51451,$7.07,44713 +928,67787,$483.24,84365 +929,15318,"$62,245.54",75474 +930,73006,$3.32,42447 +931,45401,"$23,249.67",77828 +932,48954,"$2,648.49",26685 +933,64030,"$3,517.86",369 +934,13182,$874.47,68273 +935,72970,$776.32,69651 +936,58775,"$4,572.47",64458 +937,18662,"$43,200.61",69651 +938,53301,"$24,465.01",52675 +939,60826,"$83,693.35",37512 +940,8714,$980.31,2467 +941,51451,"$1,787.94",52202 +942,39707,"$5,896.95",59109 +943,48954,"$1,848.33",18126 +944,21156,$372.68,44610 +945,33954,"$33,314.95",21396 +946,10383,$5.60,65850 +947,72970,$799.64,12082 +948,58050,$187.31,45383 +949,3109,"$6,757.36",36811 +950,59386,"$5,581.62",15161 +951,22057,$460.41,70315 +952,58775,"$80,753.30",64490 +953,60985,$1.82,6739 +954,52098,"$1,354.51",76092 +955,16853,$4.93,59440 +956,13065,"$5,431.32",20618 +957,31278,"$29,775.98",6169 +958,34983,"$54,260.41",21000 +959,42839,"$5,180.66",32348 
+960,63901,"$7,298.82",55365 +961,68765,$755.92,47682 +962,50169,"$85,655.21",63718 +963,48937,"$3,510.09",36878 +964,16216,$77.51,77356 +965,22057,"$17,583.65",4841 +966,51991,"$7,670.23",2467 +967,14846,$87.72,69651 +968,67873,$4.99,37035 +969,36001,"$31,864.55",59938 +970,41080,$2.82,47682 +971,46346,$971.47,45884 +972,18727,"$1,441.72",3931 +973,22363,"$3,188.64",22573 +974,66228,"$2,364.20",3253 +975,26804,"$3,770.84",31255 +976,51451,$9.83,61120 +977,12034,"$51,362.17",50713 +978,46708,"$6,053.03",26631 +979,70515,$197.82,84684 +980,33520,$427.79,85293 +981,67787,$612.39,44409 +982,42679,$0.04,6169 +983,31770,$9.74,6513 +984,67873,"$22,791.43",4172 +985,45401,$47.90,63005 +986,13065,$566.29,5133 +987,62926,$6.68,77910 +988,8198,"$23,777.73",60812 +989,42337,$5.53,21923 +990,64431,"$3,089.49",74986 +991,48110,$7.14,77356 +992,49041,$156.72,52675 +993,51451,$528.33,7049 +994,23038,$386.14,32207 +995,48110,"$41,246.73",23588 +996,57809,"$3,998.16",12079 +997,50554,$143.18,30135 +998,39707,$786.54,77079 +999,14278,"$48,105.34",3277 diff --git a/metasyn/distribution/base.py b/metasyn/distribution/base.py index fb6c8e40..26cd28a7 100644 --- a/metasyn/distribution/base.py +++ b/metasyn/distribution/base.py @@ -52,6 +52,8 @@ class BaseFitter(ABC): plugin_version: str = "1.0" def __init__(self, privacy: BasePrivacy): + if not isinstance(privacy, BasePrivacy): + raise TypeError(f"To initialize fitter, supply a Privacy object, not {type(privacy)}.") self.privacy = privacy def fit(self, values: Union[npt.NDArray, pl.Series]) -> BaseDistribution: diff --git a/metasyn/distribution/freetext.py b/metasyn/distribution/freetext.py index 818a9086..1240aabd 100644 --- a/metasyn/distribution/freetext.py +++ b/metasyn/distribution/freetext.py @@ -41,7 +41,7 @@ class FreeTextDistribution(BaseDistribution): """ - def __init__(self, locale: str, avg_sentences: Optional[float], avg_words: float): + def __init__(self, locale: str, avg_sentences: float, avg_words: float): self.locale: 
str = locale self.avg_sentences = avg_sentences self.avg_words = avg_words @@ -49,7 +49,7 @@ def __init__(self, locale: str, avg_sentences: Optional[float], avg_words: float def draw(self): - if self.avg_sentences is None: + if self.avg_sentences is None or self.avg_sentences < 0.01: n_words = max(1, poisson(self.avg_words).rvs()) sentence = self.fake.sentence(n_words) return sentence[:-1] @@ -110,7 +110,7 @@ def _fit(self, series, max_values: int = 50): n_punctuation = len(list(PUNCTUATION.finditer(all_text))) n_words = len(list(LETTERS.finditer(all_text))) if n_punctuation < n_non_empty//3: - avg_sentence = None + avg_sentence = 0.0 else: avg_sentence = n_punctuation/len(series) avg_words = n_words/len(series) diff --git a/metasyn/distribution/normal.py b/metasyn/distribution/normal.py index b765a1d2..b67a32c2 100644 --- a/metasyn/distribution/normal.py +++ b/metasyn/distribution/normal.py @@ -149,8 +149,10 @@ class ContinuousTruncatedNormalFitter(BaseFitter): """Fitter for continuous truncated normal fitter.""" def _fit(self, series): - lower = series.min() - 1e-8 - upper = series.max() + 1e-8 + lower = series.min() + upper = series.max() + if lower == upper: + return self.distribution(lower-1e-8, upper+1e-8, lower, 1) return self._fit_with_bounds(series, lower, upper) def _fit_with_bounds(self, values, lower, upper): @@ -190,6 +192,10 @@ class DiscreteNormalDistribution(ContinuousNormalDistribution): def draw(self): return int(super().draw()) + @classmethod + def default_distribution(cls, var_type=None): # noqa: ARG003 + return cls(2, 20) + @builtin_fitter(distribution=DiscreteNormalDistribution, var_type="discrete") class DiscreteNormalFitter(ScipyFitter): @@ -219,6 +225,12 @@ class DiscreteTruncatedNormalDistribution(ContinuousTruncatedNormalDistribution) def draw(self): return int(super().draw()) + @classmethod + def default_distribution(cls, var_type=None): # noqa: ARG003 + return cls(2, 20, 0, 50) + + + 
@builtin_fitter(distribution=DiscreteTruncatedNormalDistribution, var_type="discrete") class DiscreteTruncatedNormalFitter(ContinuousTruncatedNormalFitter): """Fitter for discrete truncated normal distribution.""" diff --git a/metasyn/file.py b/metasyn/file.py index 6862ecda..cd3af60a 100644 --- a/metasyn/file.py +++ b/metasyn/file.py @@ -7,6 +7,7 @@ from typing import Any, Optional, Type, Union import polars as pl +from tqdm import tqdm _AVAILABLE_FILE_INTERFACES = {} @@ -202,22 +203,24 @@ def _read_data(cls, fp, max_rows=None, chunk_size=None): _, metadata = prs_func(fp, metadataonly=True) n_rows = metadata.number_rows - if max_rows >= 2*n_rows: # Not enough rows to used chunked sampling, read first max_rows + + # Number of chunks is maximum number of rows divided by chunksize, rounded up + n_chunks = ((max_rows-1) // chunk_size) + 1 + # Starts of chunks are separated by total number of rows divided by number of chunks + skip_size = n_rows // n_chunks + if skip_size <= chunk_size: # Sampling useless when all chunks are packed together return prs_func(fp, apply_value_formats=True, output_format="polars", row_limit=max_rows) - skip_factor = n_rows // max_rows all_df = [] - i_chunk = 0 - for temp_df, prs_meta in pyreadstat.read_file_in_chunks( - prs_func, fp, apply_value_formats=True, output_format="polars", - chunksize=chunk_size): - # Done - if (i_chunk//skip_factor)*chunk_size >= max_rows: - break - if i_chunk % skip_factor == 0: - all_df.append(temp_df) - i_chunk += 1 + + disable = max_rows < 1000 + for i_row in tqdm(range(0, n_rows, skip_size), disable=disable): + # If we need less than a chunk + row_limit = min(chunk_size, max_rows - (i_row//skip_size)*chunk_size) + temp_df, prs_meta = prs_func(fp, row_offset=i_row, row_limit=row_limit, + apply_value_formats=True, output_format="polars") + all_df.append(temp_df) return pl.concat(all_df, how="vertical_relaxed"), prs_meta @@ -225,7 +228,6 @@ def _read_data(cls, fp, max_rows=None, chunk_size=None): def 
_get_df_metadata(cls, fp: Union[Path, str], **kwargs): """Read the dataset including the metadata.""" df, prs_metadata = cls._read_data(fp, **kwargs) - # df = pl.DataFrame(pandas_df) return cls._convert_with_orig_format(df, prs_metadata), prs_metadata @@ -327,10 +329,13 @@ def _extract_metadata(cls, prs_metadata, fp): "variable_format": prs_metadata.original_variable_types, "compress": compress, "variable_display_width": prs_metadata.variable_display_width, - "file_label": prs_metadata.file_label, "variable_value_labels": prs_metadata.variable_value_labels, "variable_measure": prs_metadata.variable_measure, } + # Workaround for TOML files that don't like None values. + if prs_metadata.file_label is not None: + metadata["file_label"] = prs_metadata.file_label + return metadata @@ -378,9 +383,12 @@ def _extract_metadata(cls, prs_metadata, fp): # noqa: ARG003 metadata = { "column_labels": prs_metadata.column_labels, "variable_format": prs_metadata.original_variable_types, - "file_label": prs_metadata.file_label, "variable_value_labels": prs_metadata.variable_value_labels, } + # Workaround for TOML files that don't like None values. + if prs_metadata.file_label is not None: + metadata["file_label"] = prs_metadata.file_label + return metadata def _prep_df_for_writing(self, df): diff --git a/metasyn/gmf.py b/metasyn/gmf.py new file mode 100644 index 00000000..119b17f4 --- /dev/null +++ b/metasyn/gmf.py @@ -0,0 +1,266 @@ +"""The validation module contains functions to validate the serialized output of distributions. + +This ensures that the Generative Metadata Format (GMF) files are interoperable and well formed. 
+""" + +from __future__ import annotations + +import warnings +from abc import ABC +from copy import deepcopy +from importlib.metadata import entry_points + +import jsonschema + +from metasyn.distribution.na import NADistribution +from metasyn.registry import DistributionRegistry + +SCHEMA_BASE_v11 = { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "http://sodascience.github.io/generative_metadata_format/core/1.1/generative_metadata_format", # noqa: E501 + "type": "object", + "properties": { + "gmf_version": {"type": "string"}, + "n_rows": {"type": "number"}, + "n_columns": {"type": "number"}, + "provenance": { + "type": "object", + "properties": { + "created by": {"type": "object"}, + "creation time": {"type": "string"} + }, + "required": ["created by", "creation time"] + }, + "vars": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "description": {"type": "string"}, + "type": {"enum": ["discrete", "continuous", "string", "categorical", "date", + "datetime", "time"]}, + "dtype": {"type": "string"}, + "prop_missing": {"type": "number"}, + "distribution": { + "$ref": "#/$defs/all_dist_def" + } + } + }, + "required": ["name", "type", "dtype", "provenance", "prop_missing", "distribution"] + } + }, + "required": ["n_rows", "n_columns", "vars"], +} + + +SCHEMA_BASE_v2 = { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "http://sodascience.github.io/generative_metadata_format/core/2.0/generative_metadata_format", # noqa: E501 + "type": "object", + "properties": { + "gmf_version": {"type": "string"}, + "relations": { + "type": "array", + "items": { + "type": "object", + "properties": { + "primary_table": {"type": "string"}, + "primary_key": {"type": "string"}, + "foreign_table": {"type": "string"}, + "foreign_key": {"type": "string"}, + "relation_type": {"type": "string"} + }, + "required": ["primary_table", "primary_key", "foreign_table", "foreign_key", + "relation_type"] 
+ }, + }, + "provenance": { + "type": "object", + "properties": { + "created by": {"type": "object"}, + "creation time": {"type": "string"} + }, + "required": ["created by", "creation time"] + }, + "tables" : { + "type": "array", + "items": { + "type": "object", + "properties": { + "n_rows": {"type": "number"}, + "n_columns": {"type": "number"}, + "vars": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "description": {"type": "string"}, + "type": {"enum": ["discrete", "continuous", "string", "categorical", + "date", "datetime", "time"]}, + "dtype": {"type": "string"}, + "prop_missing": {"type": "number"}, + "distribution": { + "$ref": "#/$defs/all_dist_def" + } + } + }, + "required": ["name", "type", "dtype", "provenance", "prop_missing", + "distribution"] + } + }, + "required": ["n_rows", "n_columns", "vars"], + } + }, + }, +} + + + +class BaseGmfParser(ABC): + """Base class for parsing GmfFiles.""" + + base_schema: dict = {} + versions: list[str] = [] + + + def distribution_schema(self, packages: list[str]) -> list[dict]: + defs: list[dict] = [] + for fitter in DistributionRegistry.parse(packages).fitters: + defs.append(fitter.distribution.schema()) + defs.append(NADistribution.schema()) + return defs + + def create_schema(self, packages: list[str]) -> dict: + """Create JSON Schema to validate a GMF file. + + Arguments + --------- + packages: + List of packages to create the schema with. + + Returns + ------- + schema: + Schema containing all the distributions in the distribution packages. + """ + defs: list[dict] = [] + for fitter in DistributionRegistry.parse(packages).fitters: + defs.append(fitter.distribution.schema()) + defs.append(NADistribution.schema()) + + schema = deepcopy(self.base_schema) + schema.update({"$defs": {"all_dist_def": {"anyOf": defs}}}) + return schema + + def parse(self, gmf_dict: dict): + """Convert the specific version of the gmf dictionary to the current standard. 
+ + Parameters + ---------- + gmf_dict + Dictionary to be converted. + + Returns + ------- + Converted dictionary. + """ + return gmf_dict + + def validate_gmf_dict(self, gmf_dict: dict): + """Validate a JSON dictionary of a metaframe as it would be written to a GMF file. + + Make sure that you have used the _jsonify function to convert numpy arrays to + lists, etc. + + Arguments + --------- + gmf_dict: + Dictionary containing the metasyn output for a metaframe. + """ + packages = [entry.name for entry in entry_points(group="metasyn.distribution_registry")] + schema = self.create_schema(packages) + jsonschema.validate(gmf_dict, schema) + +class GmfV11Parser(BaseGmfParser): + """GMF parser for version 1.1 and earlier.""" + + versions: list[str] = ["1.1"] + base_schema: dict = SCHEMA_BASE_v11 + + def parse(self, gmf_dict: dict): + new_gmf_dict = deepcopy(gmf_dict) + n_rows = new_gmf_dict.pop("n_rows") + n_cols = new_gmf_dict.pop("n_columns") + vars = new_gmf_dict.pop("vars") + + new_gmf_dict["tables"] = [ + { + "name": "single_table", + "n_rows": n_rows, + "n_columns": n_cols, + "vars": vars + } + ] + return new_gmf_dict + +class GmfV20Parser(BaseGmfParser): + """GMF parser for version 2.0 and later.""" + + versions: list[str] = ["2.0", "*"] + base_schema: dict = SCHEMA_BASE_v2 + +def _get_parser_class(gmf_dict: dict) -> type[BaseGmfParser]: + version = gmf_dict.get("gmf_version", "1.1") + + all_parsers = [GmfV11Parser, GmfV20Parser] + + best_parser_class = None + for parser_class in all_parsers: + if version in parser_class.versions: + best_parser_class = parser_class + break + + if best_parser_class is None: + for par in all_parsers: + if "*" in par.versions: + best_parser_class = par + break + + warnings.warn("Reading GMF file with unknown GMF version, update metasyn to ensure correct " + "reading of the GMF file.") + assert best_parser_class is not None + return best_parser_class + +def validate_gmf_dict(gmf_dict: dict): + """Validate gmf dictionary as read from 
the file for different GMF versions. + + Parameters + ---------- + gmf_dict + Dictionary to be validated. + """ + parser = _get_parser_class(gmf_dict)() + parser.validate_gmf_dict(gmf_dict) + +def parse_gmf_dict(gmf_dict: dict, validate: bool = True) -> dict: + """Parse GMF dictionary to convert it to the standard dictionary. + + This acts as a compatibility layer for different versions of the GMF standard. + + Parameters + ---------- + gmf_dict: + Dictionary to be converted + validate: + Whether to validate the dictionary with the JSON schema, by default True + + Returns + ------- + A potentially validated and converted dictionary. + """ + parser = _get_parser_class(gmf_dict)() + if validate: + parser.validate_gmf_dict(gmf_dict) + return parser.parse(gmf_dict) diff --git a/metasyn/metaframe.py b/metasyn/metaframe.py index 80540c5f..4f647003 100644 --- a/metasyn/metaframe.py +++ b/metasyn/metaframe.py @@ -22,9 +22,9 @@ from metasyn.config import MetaConfig from metasyn.file import BaseFileInterface, file_interface_from_dict +from metasyn.gmf import parse_gmf_dict from metasyn.privacy import BasePrivacy, get_privacy from metasyn.util import set_global_seeds -from metasyn.validation import validate_gmf_dict from metasyn.var import MetaVar from metasyn.varspec import VarSpec @@ -61,11 +61,13 @@ def __init__( meta_vars: List[MetaVar], n_rows: Optional[int] = None, file_format: Union[None, BaseFileInterface, dict[str, Any]] = None, + name: str = "single_table" ): self.meta_vars = meta_vars self.n_rows = n_rows self._file_format: Union[None, dict[str, Any]] self.file_format = file_format # type: ignore + self.name = name @property def n_columns(self) -> int: @@ -83,6 +85,7 @@ def fit_dataframe( # noqa: PLR0912 progress_bar: bool = True, config: Optional[Union[pathlib.Path, str, MetaConfig]] = None, file_format: Union[dict[str, Any], BaseFileInterface, None] = None, + name: str = "single_table", ): """Create a metasyn object from a polars (or pandas) dataframe. 
@@ -189,9 +192,9 @@ def fit_dataframe( # noqa: PLR0912 raise ValueError( "Please provide the number of rows in the configuration, or supply a DataFrame." ) - return cls(all_vars, meta_config.n_rows, file_format) + return cls(all_vars, meta_config.n_rows, file_format, name=name) n_rows = len(df) if n_rows is None else n_rows - return cls(all_vars, n_rows, file_format) + return cls(all_vars, n_rows, file_format, name=name) @classmethod def from_config(cls, meta_config: MetaConfig) -> MetaFrame: @@ -211,9 +214,7 @@ def from_config(cls, meta_config: MetaConfig) -> MetaFrame: def to_dict(self) -> Dict[str, Any]: """Create dictionary with the properties for recreation.""" self_dict = { - "gmf_version": "1.1", - "n_rows": self.n_rows, - "n_columns": self.n_columns, + "gmf_version": "2.0", "provenance": { "created by": { "name": "metasyn", @@ -221,13 +222,37 @@ def to_dict(self) -> Dict[str, Any]: }, "creation time": datetime.now().isoformat(), }, - "file_format": self.file_format, - "vars": [var.to_dict() for var in self.meta_vars], + "tables": [ + { + "name": self.name, + "n_rows": self.n_rows, + "n_columns": self.n_columns, + "file_format": self.file_format, + "vars": [var.to_dict() for var in self.meta_vars], + } + ] } if self.file_format is None: - self_dict.pop("file_format") + self_dict["tables"][0].pop("file_format") # type: ignore return self_dict + @classmethod + def from_dict(cls, gmf_dict: dict, table_name: Optional[str] = None): + if table_name is None: + table_name = gmf_dict["tables"][0]["name"] + + table_dict = None + for cur_table_dict in gmf_dict["tables"]: + if cur_table_dict["name"] == table_name: + table_dict = cur_table_dict + break + if table_dict is None: + raise KeyError(f"Cannot find table with name {table_name} in GMF file.") + + n_rows = table_dict["n_rows"] + meta_vars = [MetaVar.from_dict(d) for d in table_dict["vars"]] + return cls(meta_vars, n_rows, table_dict.get("file_format"), name=table_name) + def __getitem__(self, key: Union[int, str]) 
-> MetaVar: """Return meta var either by variable name or index.""" if isinstance(key, int): @@ -307,7 +332,8 @@ def save(self, fp: Optional[Union[pathlib.Path, str]], validate: bool = True) -> self.save_json(fp, validate) @classmethod - def load(cls, fp: Union[pathlib.Path, str], validate: bool = True) -> MetaFrame: + def load(cls, fp: Union[pathlib.Path, str], validate: bool = True, + table_name: Optional[str] = None) -> MetaFrame: """Read a MetaFrame from a JSON or TOML GMF file. Optionally, validate the saved JSON file against the JSON schema(s) included in the @@ -328,9 +354,9 @@ def load(cls, fp: Union[pathlib.Path, str], validate: bool = True) -> MetaFrame: """ fp_path = Path(fp) if fp_path.suffix == ".toml": - return cls.load_toml(fp, validate) + return cls.load_toml(fp, validate, table_name=table_name) else: - return cls.load_json(fp, validate) + return cls.load_json(fp, validate, table_name=table_name) def save_json(self, fp: Optional[Union[pathlib.Path, str]], validate: bool = True) -> None: """Serialize and save the MetaFrame to a JSON file, following the GMF format. @@ -347,7 +373,7 @@ def save_json(self, fp: Optional[Union[pathlib.Path, str]], validate: bool = Tru """ self_dict = _jsonify(self.to_dict()) if validate: - validate_gmf_dict(self_dict) + parse_gmf_dict(self_dict, validate=True) if fp is None: print(json.dumps(self_dict, indent=4)) else: @@ -355,7 +381,8 @@ def save_json(self, fp: Optional[Union[pathlib.Path, str]], validate: bool = Tru json.dump(self_dict, f, indent=4) @classmethod - def load_json(cls, fp: Union[pathlib.Path, str], validate: bool = True) -> MetaFrame: + def load_json(cls, fp: Union[pathlib.Path, str, dict], validate: bool = True, + table_name: Optional[str] = None) -> MetaFrame: """Read a MetaFrame from a JSON file. Parameters @@ -370,15 +397,14 @@ def load_json(cls, fp: Union[pathlib.Path, str], validate: bool = True) -> MetaF MetaFrame: A restored MetaFrame from the file. 
""" - with open(fp, "r", encoding="utf-8") as f: - self_dict = json.load(f) - - if validate: - validate_gmf_dict(self_dict) + if isinstance(fp, dict): + self_dict = fp + else: + with open(fp, "r", encoding="utf-8") as f: + self_dict = json.load(f) - n_rows = self_dict["n_rows"] - meta_vars = [MetaVar.from_dict(d) for d in self_dict["vars"]] - return cls(meta_vars, n_rows, self_dict.get("file_format")) + self_dict = parse_gmf_dict(self_dict, validate=validate) + return cls.from_dict(self_dict, table_name=table_name) def to_json(self, fp: Union[pathlib.Path, str], validate: bool = True) -> None: """Export, deprecated method, use Metaframe.save_json instead.""" @@ -421,9 +447,10 @@ def save_toml(self, fp: Optional[Union[pathlib.Path, str]], validate: bool = Tru ) self_dict = _jsonify(self.to_dict()) if validate: - validate_gmf_dict(self_dict) + parse_gmf_dict(self_dict, validate=True) - doc = tomlkit.loads(tomlkit.dumps(self_dict)) + all_doc = tomlkit.loads(tomlkit.dumps(self_dict)) + doc = all_doc["tables"][0] doc["n_rows"].comment("Number of rows") doc["n_columns"].comment("""Number of columns @@ -469,28 +496,22 @@ def save_toml(self, fp: Optional[Union[pathlib.Path, str]], validate: bool = Tru elif "privacy" in var.creation_method: privacy = get_privacy(**var.creation_method["privacy"]) parameter_comments.append(privacy.comment(var)) - if var.distribution.matches_name("multinoulli") and not multi_default: - counts = (var.distribution.probs * (1 - var.prop_missing) * self.n_rows).round() - parameter_comments.append(f"Counts: {counts.astype(int)}\n") par_comment = "\n# ".join(parameter_comments) + "\n\n" doc["vars"][i]["distribution"]["parameters"].add(tomlkit.comment(par_comment)) if fp is None: - print(tomlkit.dumps(doc)) + print(tomlkit.dumps(all_doc)) else: with open(fp, "w", encoding="utf-8") as f: - tomlkit.dump(doc, f) + tomlkit.dump(all_doc, f) @classmethod - def load_toml(cls, fp: Union[pathlib.Path, str], validate: bool = True) -> MetaFrame: + def 
load_toml(cls, fp: Union[pathlib.Path, str], validate: bool = True, + table_name: Optional[str] = None) -> MetaFrame: with open(fp, "rb") as f: self_dict = tomllib.load(f) - if validate: - validate_gmf_dict(self_dict) - - n_rows = self_dict["n_rows"] - meta_vars = [MetaVar.from_dict(d) for d in self_dict["vars"]] - return cls(meta_vars, n_rows, self_dict.get("file_format")) + self_dict = parse_gmf_dict(self_dict, validate=validate) + return cls.from_dict(self_dict, table_name=table_name) def synthesize( self, diff --git a/metasyn/multiframe.py b/metasyn/multiframe.py new file mode 100644 index 00000000..1de079a4 --- /dev/null +++ b/metasyn/multiframe.py @@ -0,0 +1,447 @@ +"""Multi dataframe functionality for metasyn.""" +import json +import pathlib +import re +import warnings +from copy import deepcopy +from dataclasses import dataclass +from enum import Enum +from typing import Any, Optional, Union + +import polars as pl + +from metasyn.gmf import parse_gmf_dict, validate_gmf_dict +from metasyn.metaframe import MetaFrame, _jsonify + + +class RelationType(Enum): + """Enumeration for the different relation types between columns. + + There are multiple types of relations that have different associated symbols: + Subset (``SUBSET OF``), Equal (``EQUALS``), EqualOrdered (``EQUAL ORDERED``) + and Infer (``INFER FROM``). + Subset means that the foreign column contains values from the primary column, but + not all values from the primary column need to be present in the foreign column. + Equal means that all values in the primary column are present in the foreign column + exactly once, but not necessarily in the same order. EqualOrdered is the same as Equal + except that they are also present in the same order. Infer means that it is unknown + which of the different relation types is the correct one and that this is still to be + inferred. 
+ """ + + Subset = "SUBSET OF" + Equal = "EQUALS" + EqualOrdered = "EQUAL ORDERED" + Infer = "INFER FROM" + + def __str__(self): + return self.value + + @classmethod + def parse(cls, symbol: str) -> "RelationType": + match symbol.upper(): + case "SUBSET OF": + return cls.Subset + case "EQUALS": + return cls.Equal + case "EQUAL ORDERED": + return cls.EqualOrdered + case "INFER FROM": + return cls.Infer + raise ValueError(f"Cannot parse relation type '{symbol}': symbol unknown.") + +def _create_re(name): + return r"(?P<" + name + r">(?:\\\]|\\\[|[^\s\]\[]|[\s])+)" + +def _unescape(result): + return result.replace(r"\[", "[").replace(r"\]", "]") + +@dataclass +class ColumnRelation(): + """Specification of how two columns relate to each other for multiframe inference. + + The easiest way to specify the relation between two columns is use the + :meth:`ColumnRelation.parse` method. + """ + + foreign_table: str + foreign_key: str + primary_table: str + primary_key: str + relation_type: RelationType = RelationType.Infer + + + def __post_init__(self): + if self.primary_key == self.foreign_key and self.primary_table == self.foreign_table: + raise ValueError("Cannot have a primary <-> foreign key relation between the " + "same table and column.") + + def __str__(self): + return (f"{self.foreign_table}[{self.foreign_key}] {self.relation_type}" + f" {self.primary_table}[{self.primary_key}]") + + @classmethod + def parse(cls, relation_str: str) -> "ColumnRelation": + """Parse a string to convert it into a column relation. + + Parameters + ---------- + relation_str + String of the form primary_table[primary_column] {relation type} + foreign_table[foreign_column]. See :class:`RelationType` for the different relations + types. Note that the tables and columns can have spaces. + + Raises + ------ + ValueError: + If the relation string cannot be parsed. + + Returns + ------- + An initialized column relation. 
+ """ + regex = re.compile( + _create_re("ftab") + r"\[" + _create_re("fcol") + + r"\]\s+(?PSUBSET OF|EQUALS|INFER FROM|EQUAL ORDERED)\s?" + + _create_re("ptab") + r"\[" + _create_re("pcol") + r"\]" + ) + match = regex.match(relation_str) + if match is None: + raise ValueError(f"Cannot parse relation '{relation_str}'. It should be of the form:" + " table_a[foreign_column] SUBSET OF table_b[primary_column].") + return cls( + foreign_table = _unescape(match.group("ftab")), + foreign_key = _unescape(match.group("fcol")), + primary_table = _unescape(match.group("ptab")), + primary_key = _unescape(match.group("pcol")), + relation_type = RelationType.parse(match.group("rel")) + ) + + def to_dict(self): + """Convert the column relation to a dictionary. + + Used mainly for serialization to json. + + Returns + ------- + Dictionary containing the required information of the column relation. + """ + return { + "foreign_table": self.foreign_table, + "foreign_key": self.foreign_key, + "primary_table": self.primary_table, + "primary_key": self.primary_key, + "relation_type": str(self.relation_type), + } + + @classmethod + def from_dict(cls, col_dict: dict[str, Any]) -> "ColumnRelation": + """Create ColumnRelation from a serialized dictionary. + + Mainly used for deserializing from json files. + + Parameters + ---------- + col_dict + Dictionary containing the specifications of a column relation. + + Returns + ------- + A newly initialized column relation. 
+ """ + new_col_dict = deepcopy(col_dict) + new_col_dict["relation_type"] = RelationType(col_dict["relation_type"]) + return cls(**new_col_dict) + +def _validate_relations(relations: list[ColumnRelation], mf_or_df_dict): + columns = {} + for name, mf_or_df in mf_or_df_dict.items(): + if isinstance(mf_or_df, MetaFrame): + columns[name] = [var.name for var in mf_or_df.meta_vars] + else: + columns[name] = mf_or_df.columns + + for rel in relations: + if rel.primary_table not in mf_or_df_dict: + raise ValueError(f"Cannot find table with name {rel.primary_table}, " + f"available: {list(mf_or_df)}.") + if rel.primary_key not in columns[rel.primary_table]: + raise ValueError( + f"Cannot find column '{rel.primary_key}' in table " + f"'{rel.primary_table}, available columns: {columns[rel.primary_table]}'") + if rel.foreign_table not in mf_or_df_dict: + raise ValueError(f"Cannot find table with name {rel.foreign_table}.") + if rel.foreign_key not in columns[rel.foreign_table]: + raise ValueError( + f"Cannot find column '{rel.foreign_key}' in table " + f"'{rel.foreign_table}, available columns: {columns[rel.foreign_table]}'") + for other_rel in relations: + if (rel.primary_table == other_rel.foreign_table + and rel.primary_key == other_rel.foreign_key): + raise ValueError(f"Column in {rel.primary_table}: {rel.primary_key} cannot be " + "a foreign and primary key at the same time.") + if (isinstance(mf_or_df_dict[rel.primary_table], pl.DataFrame) + and not mf_or_df_dict[rel.primary_table][rel.primary_key].is_unique().all()): + warnings.warn(f"Column '{rel.primary_key}' in table '{rel.primary_table}' is a " + "primary key, but not unique.") + +def _infer_relations(relations, dfs_dict): + """For all relations that have RelationType.Infer try to guess the relation. + + This only works if the dataframe objects are provided. 
+ """ + for rel in relations: + if rel.relation_type != RelationType.Infer: + continue + if dfs_dict is None: + raise ValueError("Cannot infer any relations without the original dataframes.") + primary_series = dfs_dict[rel.primary_table][rel.primary_key] + foreign_series = dfs_dict[rel.foreign_table][rel.foreign_key] + if (len(primary_series) == len(foreign_series) + and (primary_series == foreign_series).all()): + rel.relation_type = RelationType.EqualOrdered + elif (len(primary_series) == len(foreign_series) + and (primary_series.sort() == foreign_series.sort()).all()): + rel.relation_type = RelationType.Equal + elif (pl.union((primary_series, foreign_series)).unique().len() + == primary_series.unique().len()): + rel.relation_type = RelationType.Subset + else: + raise ValueError(f"Cannot infer relation type for relation {rel}, possible issues:" + " new item in foreign table.") + + + +class MultiFrame(): + """Generation of multiple synthetic data frames. + + This class implements the generation of multiple synthetic data frames with + relations between columns. + """ + + def __init__(self, metaframes: dict, relations: list[ColumnRelation], + dataframes: Optional[dict[str, pl.DataFrame]] = None): + """Initialize the MultiFrame object. + + Parameters + ---------- + metaframes: + A dictionary containing metaframes to make a multi metaframe from. + The keys are used to identify the tables, but can be freely chosen as strings. + You can choose for example the keys to be the names of the tables or the files + in which they are stored. + relations: + A list of relations between columns, see :class:`ColumnRelations`. + dataframes: + Dataframes from which the metaframes were generated. By default None, + in which case relations cannot be inferred from the data. 
+ """ + self.metaframes = metaframes + self.dfs = dataframes + self.relations = [ColumnRelation.parse(rel) if isinstance(rel, str) else rel + for rel in relations] + _validate_relations(self.relations, metaframes if dataframes is None else dataframes) + _infer_relations(self.relations, dataframes) + + def __getitem__(self, key: str): + if key not in self.metaframes: + raise KeyError(f"No table/metaframe with '{key}' available, choose from " + f"{list(self.metaframes)}.") + return self.metaframes[key] + + def __str__(self) -> str: + all_str = "" + for key, mf in self.metaframes.items(): + mf_str = f"Table {key}:\n" + mf_str += f" Number of rows: {mf.n_rows}\n" + mf_str += f" Number of columns: {mf.n_columns}\n" + col_names = [var.name for var in mf.meta_vars][:5] + if len(col_names) < len(mf.meta_vars): + col_names[-1] == "..." + mf_str += f" Columns: {', '.join(col_names)}\n" + all_str += mf_str + "\n" + if len(self.relations) > 0: + all_str += "Relations between columns:\n" + for rel in self.relations: + all_str += " " + str(rel) + "\n" + return all_str + + def synthesize(self, n: Optional[dict] = None) -> dict[str, pl.DataFrame]: + """Synthesize multiple tables. + + Parameters + ---------- + n: + Number of rows to synthesize. The number of rows for each table is individually + set using a dictionary, so for example for table 'x' with 10 rows, do ``n = {'x': 10}``. + + Returns + ------- + A dictionary with the synthesized dataframes. + + Raises + ------ + ValueError + When the combination of data frames do not have the right number of rows. + For example when one relation has the equal relation type, columns in both tables + should have the same number of rows. + ValueError + When one of the relations has a relation type that is unknown or RelationType.Infer. + """ + if n is None: + n = {} + + # Check whether the number of rows between tables is compatible with the relations. 
+ n_rows = {key: n.get(key, self.metaframes[key].n_rows) for key in self.metaframes} + for rel in self.relations: + if rel.relation_type in (RelationType.Equal, RelationType.EqualOrdered): + nrow_prime, nrow_for = n_rows[rel.primary_table], n_rows[rel.foreign_table] + if nrow_prime != nrow_for: + raise ValueError( + f"Cannot synthesize multiframe, because table {rel.primary_table}" + f"({nrow_prime}) and table {rel.foreign_table}({nrow_for}) should have " + f"the same number of rows, since column {rel.primary_key} and " + f"{rel.foreign_key} should have the same number of rows.") + + # Generate the first version of the synthetic tables. + dfs = {key: mf.synthesize(n_rows[key]) for key, mf in self.metaframes.items()} + + # Implement the relations. + for rel in self.relations: + cur_n = len(dfs[rel.foreign_table]) + primary_series = dfs[rel.primary_table][rel.primary_key] + if rel.relation_type == RelationType.EqualOrdered: + dfs[rel.foreign_table] = dfs[rel.foreign_table].with_columns( + **{rel.foreign_key: primary_series.head(cur_n)}) + elif rel.relation_type == RelationType.Equal: + dfs[rel.foreign_table] = dfs[rel.foreign_table].with_columns( + **{rel.foreign_key: primary_series.sample( + cur_n, with_replacement=False, shuffle=True)}) + elif rel.relation_type == RelationType.Subset: + dfs[rel.foreign_table] = dfs[rel.foreign_table].with_columns( + **{rel.foreign_key: primary_series.sample( + cur_n, with_replacement=True, shuffle=True)}) + + else: + raise ValueError(f"Unknown relation: {rel.relation_type}, choose one of " + "RelationType.Subset, RelationType.Equal, " + "RelationType.EqualOrdered") + return dfs + + def save_json(self, fp: Optional[Union[pathlib.Path, str]] = None, validate: bool = True): + """Save the MultiFrame object to a file. + + Parameters + ---------- + fp: + File to save the metadata to. If left at None, it will print it instead. 
+ """ + relations = [rel.to_dict() for rel in self.relations] + json_dict = {"gmf_version": "", "provenance": {}, "relations": relations, "tables": []} + for name, mf in self.metaframes.items(): + meta_dict = _jsonify(mf.to_dict()) + table = meta_dict.pop("tables") + json_dict.update(meta_dict) + json_dict["tables"].extend(table) # type: ignore + + if validate: + validate_gmf_dict(json_dict) + if fp is None: + print(json.dumps(json_dict, indent=4)) + else: + with open(fp, "w", encoding="utf=8") as f: + json.dump(json_dict, f, indent=4) + + @classmethod + def load_json(cls, fp: Union[pathlib.Path, str], validate: bool = True) -> "MultiFrame": + """Create a MultiFrame from a file with metadata. + + Parameters + ---------- + fp: + File that contains the data to create the MultiFrame. + + Returns + ------- + An initialized MultiFrame. + """ + with open(fp, "r", encoding="utf-8") as handle: + json_dict = json.load(handle) + json_dict = parse_gmf_dict(json_dict, validate=validate) + + relations = [ColumnRelation.from_dict(rel) for rel in json_dict["relations"]] + metaframes = {mf_dict["name"]: MetaFrame.load_json(json_dict, table_name=mf_dict["name"], + validate=validate) + for mf_dict in json_dict["tables"]} + return cls(metaframes, relations) + + def save(self, fp: Optional[Union[pathlib.Path, str]]): + """Save the MultiFrame to a file. + + Parameters + ---------- + fp + File to save to. + """ + self.save_json(fp) + + + @classmethod + def load(cls, fp: Union[pathlib.Path, str]) -> "MultiFrame": + """Load a MultiFrame from a GMF file. + + Parameters + ---------- + fp + GMF file to read. + + Returns + ------- + A multiframe read from the GMF file. + """ + return cls.load_json(fp) + + @classmethod + def fit_dataframes(cls, dataframes: dict[str, pl.DataFrame], relations: list[ColumnRelation], + extra_kwargs: Optional[dict[str, dict]] = None, + **global_kwargs) -> "MultiFrame": + """Fit multiple dataframes to create a MultiFrame. 
+ + Parameters + ---------- + dataframes: + Dictionary of dataframes that contain the tables to be fitted. The keys in the + dictionary are used for defining the relations between columns in different tables. + relations: + Relations between different columns, where primary/foreign key relationships are + defined. + extra_kwargs: + Extra keyword arguments to be supplied for fitting each of the individual dataframes. + If supplied, this should be a dictionary of dictionaries, where the first dictionary + has keys that correspond to the keys of the dataframes. + global_kwargs: + Extra keyword arguments applied to all dataframes equally. This gets overridden by the + the extra_kwargs keyword argument if supplied for individual dataframes. + + Returns + ------- + A fitted multiframe object, containing the metadata for all tables and their + relationships. + """ + extra_kwargs = {} if extra_kwargs is None else extra_kwargs + for key in extra_kwargs: + if key not in dataframes: + raise ValueError(f"Key '{key}' is not the name of a dataframe supplied with the " + f"dataframe argument. 
Available tables: {list(dataframes)}.") + relations = [ColumnRelation.parse(rel) if isinstance(rel, str) else rel + for rel in relations] + _validate_relations(relations, dataframes) + _infer_relations(relations, dataframes) + mfs = {} + for name, df in dataframes.items(): + cur_extra_kwargs = extra_kwargs.get(name, {}) + cur_kwargs = deepcopy(global_kwargs) + cur_kwargs.update(cur_extra_kwargs) + mfs[name] = MetaFrame.fit_dataframe(df, **cur_kwargs, name=name) + + return cls(mfs, relations, dataframes) diff --git a/metasyn/testutils.py b/metasyn/testutils.py index 40e5423f..f94ad68c 100644 --- a/metasyn/testutils.py +++ b/metasyn/testutils.py @@ -18,6 +18,7 @@ from metasyn.distribution.base import BaseDistribution, BaseFitter from metasyn.distribution.categorical import MultinoulliDistribution from metasyn.distribution.na import NADistribution +from metasyn.gmf import parse_gmf_dict from metasyn.metaframe import _jsonify from metasyn.privacy import BasePrivacy from metasyn.registry import ( @@ -111,17 +112,23 @@ def check_fitter(fitter: type[BaseFitter], privacy: BasePrivacy, ## General +- Version of GMF format: {gmf_version} +- Generated by {program_name} version {program_version} at {generation_time} +- Number of tables: {n_tables} +""" + +TABLE_TEMPLATE = """ +## Details for table {table_name} + - Number of rows: {n_rows} - Number of columns: {n_columns} -- Generated by {program_name} version {program_version} at {generation_time} -## Variables / columns +### Columns: """ - VAR_TEMPLATE = """ -### {var_name} +#### {var_name} - Distribution {class_name} with parameters: {parameters} @@ -137,53 +144,72 @@ def create_md_report(file_name, out_md_file): """Create markdown report from GMF file.""" with open(file_name, "r") as handle: gmf_dict = json.load(handle) + gmf_dict = parse_gmf_dict(gmf_dict, validate=True) header = HEADER_TEMPLATE.format( file_name=file_name, - n_rows=gmf_dict["n_rows"], - n_columns=gmf_dict["n_columns"], + n_tables=len(gmf_dict["tables"]), + 
gmf_version=gmf_dict["gmf_version"], program_name=gmf_dict["provenance"]["created by"]["name"], program_version=gmf_dict["provenance"]["created by"]["version"], generation_time=gmf_dict["provenance"]["creation time"], ) - variables = "" - - for var_dict in gmf_dict["vars"]: - var = MetaVar.from_dict(var_dict) - if isinstance(var.distribution, MultinoulliDistribution): - parameters = [f"\t- {label}: {round(prob*gmf_dict['n_rows']*(1-var.prop_missing))}\n" - for label, prob in zip(var.distribution.labels, var.distribution.probs)] - elif isinstance(var.distribution, NADistribution): - variables += (f"### {var.name}\n- Distribution NADistribution\n- Only missing values" - "\n- Examples: NA, NA, NA, ...\n") - continue - else: - parameters = [f"\t - {name}: {value}\n" - for name, value in var.distribution._param_dict().items()] - parameter_str = "".join(parameters) - if var.prop_missing > 0: - examples = np.random.permutation([str(var.distribution.draw()) for _ in range(3)] + - ["NA", "NA"]) - else: - examples = [str(x) for x in var.draw_series(5, None)] - - if "privacy" in var_dict["creation_method"]: - partition_size = var_dict["creation_method"]["privacy"]["parameters"]["partition_size"] - disclosure = f" using micro aggregation with a partition size of {partition_size}" - else: - disclosure = "" - variables += VAR_TEMPLATE.format( - var_name = var.name, - var_type=var.var_type, - class_name=var.distribution.__class__.__name__, - n_based_on=round(gmf_dict["n_rows"]*(1-var.prop_missing)), - example_list=", ".join(examples) + ", ...", - parameters=parameter_str, - missing_perc=f"{100*var.prop_missing:.2f}", - disclosure=disclosure, + md_str = header + if "relations" in gmf_dict and len(gmf_dict["relations"]) > 0: + md_str += "\n## Relations between tables\n\n" + md_str += "\n".join(f"- {rel}" for rel in gmf_dict["relations"]) + md_str += "\n" + for table in gmf_dict["tables"]: + n_rows = table["n_rows"] + n_columns = table["n_columns"] + + table_str = 
TABLE_TEMPLATE.format( + table_name=table["name"], + n_rows=n_rows, + n_columns=n_columns, ) + + variables = "" + + for var_dict in table["vars"]: + var = MetaVar.from_dict(var_dict) + if isinstance(var.distribution, MultinoulliDistribution): + parameters = [f"\t- {label}: {round(prob*n_rows*(1-var.prop_missing))}\n" + for label, prob in zip(var.distribution.labels, var.distribution.probs)] + elif isinstance(var.distribution, NADistribution): + variables += (f"### {var.name}\n- Distribution NADistribution\n" + "- Only missing values\n" + "- Examples: NA, NA, NA, ...\n") + continue + else: + parameters = [f"\t - {name}: {value}\n" + for name, value in var.distribution._param_dict().items()] + parameter_str = "".join(parameters) + if var.prop_missing > 0: + examples = np.random.permutation([str(var.distribution.draw()) for _ in range(3)] + + ["NA", "NA"]) + else: + examples = [str(x) for x in var.draw_series(5, None)] + + if "privacy" in var_dict["creation_method"]: + partition_size = var_dict["creation_method"]["privacy"]["parameters"][ + "partition_size"] + disclosure = f" using micro aggregation with a partition size of {partition_size}" + else: + disclosure = "" + variables += VAR_TEMPLATE.format( + var_name = var.name, + var_type=var.var_type, + class_name=var.distribution.__class__.__name__, + n_based_on=round(n_rows*(1-var.prop_missing)), + example_list=", ".join(examples) + ", ...", + parameters=parameter_str, + missing_perc=f"{100*var.prop_missing:.2f}", + disclosure=disclosure, + ) + md_str += table_str + variables with open(out_md_file, "w", encoding="utf-8") as handle: - handle.write(header + variables) + handle.write(md_str) def create_input_toml(file_name): diff --git a/metasyn/validation.py b/metasyn/validation.py deleted file mode 100644 index f8559704..00000000 --- a/metasyn/validation.py +++ /dev/null @@ -1,90 +0,0 @@ -"""The validation module contains functions to validate the serialized output of distributions. 
- -This ensures that the Generative Metadata Format (GMF) files are interoperable and well formed. -""" - -from __future__ import annotations - -from copy import deepcopy -from importlib.metadata import entry_points - -import jsonschema - -from metasyn.distribution.na import NADistribution -from metasyn.registry import DistributionRegistry - -SCHEMA_BASE = { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "http://sodascience.github.io/generative_metadata_format/core/1.1/generative_metadata_format", # noqa: E501 - "type": "object", - "properties": { - "n_rows": {"type": "number"}, - "n_columns": {"type": "number"}, - "provenance": { - "type": "object", - "properties": { - "created by": {"type": "object"}, - "creation time": {"type": "string"} - }, - "required": ["created by", "creation time"] - }, - "vars": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "description": {"type": "string"}, - "type": {"enum": ["discrete", "continuous", "string", "categorical", "date", - "datetime", "time"]}, - "dtype": {"type": "string"}, - "prop_missing": {"type": "number"}, - "distribution": { - "$ref": "#/$defs/all_dist_def" - } - } - }, - "required": ["name", "type", "dtype", "provenance", "prop_missing", "distribution"] - } - }, - "required": ["n_rows", "n_columns", "vars"], -} - - -def validate_gmf_dict(gmf_dict: dict): - """Validate a JSON dictionary of a metaframe as it would be written to a GMF file. - - Make sure that you have used the _jsonify function to convert numpy arrays to - lists, etc. - - Arguments - --------- - gmf_dict: - Dictionary containing the metasyn output for a metaframe. - """ - packages = [entry.name for entry in entry_points(group="metasyn.distribution_registry")] - schema = create_schema(packages) - jsonschema.validate(gmf_dict, schema) - - -def create_schema(packages: list[str]) -> dict: - """Create JSON Schema to validate a GMF file. 
- - Arguments - --------- - packages: - List of packages to create the schema with. - - Returns - ------- - schema: - Schema containing all the distributions in the distribution packages. - """ - defs: list[dict] = [] - for fitter in DistributionRegistry.parse(packages).fitters: - defs.append(fitter.distribution.schema()) - defs.append(NADistribution.schema()) - - schema = deepcopy(SCHEMA_BASE) - schema.update({"$defs": {"all_dist_def": {"anyOf": defs}}}) - return schema diff --git a/metasyn/varspec.py b/metasyn/varspec.py index fa083f3b..df271370 100644 --- a/metasyn/varspec.py +++ b/metasyn/varspec.py @@ -1,6 +1,8 @@ """Module for distribution and variable specifications.""" from __future__ import annotations +import inspect + # from metasyn.util import VarSpec from dataclasses import dataclass, field from typing import Any, Optional, Union @@ -78,8 +80,12 @@ def parse(cls, dist_spec: Optional[Union[dict, type[BaseDistribution], BaseDistr return cls(**dist_spec) if isinstance(dist_spec, DistributionSpec): return dist_spec - if issubclass(dist_spec, BaseDistribution): + if inspect.isclass(dist_spec) and issubclass(dist_spec, BaseDistribution): return cls(name=dist_spec.name, unique=dist_spec.unique) + if (isinstance(dist_spec, BaseFitter) + or (inspect.isclass(dist_spec) and issubclass(dist_spec, BaseFitter))): + raise TypeError(f"Supplied Fitter {dist_spec}, but you should supply a distribution " + "for parsing.") raise TypeError("Error parsing distribution specification of unknown type " f"'{type(dist_spec)}' with value '{dist_spec}'") diff --git a/pyproject.toml b/pyproject.toml index 5f927212..ffd2c290 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools==75.8.0", "setuptools-scm[toml]>=6.2"] # metadata-version = "2.2" for pypi +requires = ["setuptools>=77.0.3", "setuptools-scm[toml]>=6.2"] # metadata-version = "2.2" for pypi build-backend = "setuptools.build_meta" [project] @@ -12,7 +12,8 @@ description = 
"Package for creating synthetic datasets while preserving privacy. readme = "README.md" requires-python = ">=3.10" keywords = ["metadata", "open-data", "privacy", "synthetic-data", "tabular datasets"] -license = { file = "LICENSE" } +license = "MIT" +license-files = ["LICENSE"] classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", @@ -20,7 +21,6 @@ classifiers = [ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Development Status :: 5 - Production/Stable", - "License :: OSI Approved :: MIT License", ] dependencies = [ diff --git a/tests/data/example_gmf_simple_v11.json b/tests/data/example_gmf_simple_v11.json new file mode 100644 index 00000000..ee7cbd6a --- /dev/null +++ b/tests/data/example_gmf_simple_v11.json @@ -0,0 +1,177 @@ +{ + "gmf_version": "1.1", + "n_rows": 5, + "n_columns": 5, + "provenance": { + "created by": { + "name": "metasyn", + "version": "1.1.1.dev33+g0df642883" + }, + "creation time": "2025-10-15T11:10:30.997379" + }, + "vars": [ + { + "name": "ID", + "type": "discrete", + "dtype": "Int64", + "prop_missing": 0.0, + "distribution": { + "name": "core.unique_key", + "version": "1.0", + "class_name": "UniqueKeyDistribution", + "unique": true, + "parameters": { + "lower": 1, + "consecutive": true + } + }, + "creation_method": { + "created_by": "metasyn", + "distribution": { + "unique": true + }, + "fitter": { + "name": "UniqueKeyFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev33+g0df642883" + } + } + }, + { + "name": "fruits", + "type": "categorical", + "dtype": "Categorical", + "prop_missing": 0.0, + "distribution": { + "name": "core.multinoulli", + "version": "1.0", + "class_name": "MultinoulliDistribution", + "unique": false, + "parameters": { + "labels": [ + "apple", + "banana" + ], + "probs": [ + 0.4, + 0.6 + ] + } + }, + "creation_method": { + "created_by": "metasyn", + 
"fitter": { + "name": "MultinoulliFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev33+g0df642883" + } + } + }, + { + "name": "B", + "type": "discrete", + "dtype": "Int64", + "prop_missing": 0.0, + "distribution": { + "name": "core.uniform", + "version": "1.0", + "class_name": "DiscreteUniformDistribution", + "unique": false, + "parameters": { + "lower": 1, + "upper": 6 + } + }, + "creation_method": { + "created_by": "metasyn", + "distribution": { + "unique": false + }, + "fitter": { + "name": "DiscreteUniformFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev33+g0df642883" + } + } + }, + { + "name": "cars", + "type": "categorical", + "dtype": "Categorical", + "prop_missing": 0.0, + "distribution": { + "name": "core.multinoulli", + "version": "1.0", + "class_name": "MultinoulliDistribution", + "unique": false, + "parameters": { + "labels": [ + "audi", + "beetle" + ], + "probs": [ + 0.2, + 0.8 + ] + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "MultinoulliFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev33+g0df642883" + } + } + }, + { + "name": "optional", + "type": "discrete", + "dtype": "Int64", + "prop_missing": 0.2, + "distribution": { + "name": "core.uniform", + "version": "1.0", + "class_name": "DiscreteUniformDistribution", + "unique": false, + "parameters": { + "lower": -30, + "upper": 301 + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "DiscreteUniformFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev33+g0df642883" + } + } + } + ] +} \ No newline at end of file diff --git a/tests/data/example_gmf_titanic_v11.json 
b/tests/data/example_gmf_titanic_v11.json new file mode 100644 index 00000000..5528ff9d --- /dev/null +++ b/tests/data/example_gmf_titanic_v11.json @@ -0,0 +1,477 @@ +{ + "gmf_version": "1.1", + "n_rows": 891, + "n_columns": 13, + "provenance": { + "created by": { + "name": "metasyn", + "version": "1.1.1.dev42+g2631ec541" + }, + "creation time": "2025-10-16T12:32:14.437337" + }, + "file_format": { + "file_interface_name": "csv", + "format_metadata": { + "separator": ",", + "line_terminator": "\n", + "quote_char": "\"", + "null_value": "", + "encoding": "utf-8" + }, + "file_name": "demo_titanic.csv" + }, + "vars": [ + { + "name": "PassengerId", + "type": "discrete", + "dtype": "Int64", + "prop_missing": 0.0, + "distribution": { + "name": "core.unique_key", + "version": "1.0", + "class_name": "UniqueKeyDistribution", + "unique": true, + "parameters": { + "lower": 1, + "consecutive": true + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "UniqueKeyFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Name", + "type": "string", + "dtype": "String", + "prop_missing": 0.0, + "distribution": { + "name": "core.freetext", + "version": "1.0", + "class_name": "FreeTextDistribution", + "unique": false, + "parameters": { + "locale": "EN", + "avg_sentences": 2.4691358024691357, + "avg_words": 4.093153759820426 + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "FreeTextFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Sex", + "type": "categorical", + "dtype": "Categorical", + "prop_missing": 0.0, + "distribution": { + "name": "core.multinoulli", + "version": "1.0", + "class_name": "MultinoulliDistribution", + "unique": false, + "parameters": { + "labels": [ + 
"female", + "male" + ], + "probs": [ + 0.35241301907968575, + 0.6475869809203143 + ] + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "MultinoulliFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Age", + "type": "discrete", + "dtype": "Int64", + "prop_missing": 0.19865319865319866, + "distribution": { + "name": "core.truncated_normal", + "version": "1.0", + "class_name": "DiscreteTruncatedNormalDistribution", + "unique": false, + "parameters": { + "lower": -1e-08, + "upper": 80.00000001, + "mean": 28.403638823278087, + "sd": 15.862325051407092 + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "DiscreteTruncatedNormalFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Parch", + "type": "discrete", + "dtype": "Int64", + "prop_missing": 0.0, + "distribution": { + "name": "core.truncated_normal", + "version": "1.0", + "class_name": "DiscreteTruncatedNormalDistribution", + "unique": false, + "parameters": { + "lower": -1e-08, + "upper": 6.00000001, + "mean": -247.46471779076856, + "sd": 9.72548403502272 + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "DiscreteTruncatedNormalFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Ticket", + "type": "string", + "dtype": "String", + "prop_missing": 0.0, + "distribution": { + "name": "core.regex", + "version": "2.0", + "class_name": "RegexDistribution", + "unique": false, + "parameters": { + "regex_data": { + "regex": "(|[A-Z]{1,4}(|(\\.A\\.|/[0-9A-Z])) )[0-9]{4,6}", + "counts": [ + [ + [ + 643 + ], + [ + 46, + [ + [ + 75 + ], + [ + [ + [ + 
27, + 27, + 27, + 27 + ], + [ + 19, + 19, + 19 + ] + ], + 0 + ] + ], + 75, + 75 + ] + ], + 643, + 643 + ] + } + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "RegexFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "2.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Fare", + "type": "continuous", + "dtype": "Float64", + "prop_missing": 0.0, + "distribution": { + "name": "core.exponential", + "version": "1.0", + "class_name": "ExponentialDistribution", + "unique": false, + "parameters": { + "rate": 0.03052908440177665 + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "ExponentialFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Cabin", + "type": "string", + "dtype": "String", + "prop_missing": 0.7710437710437711, + "distribution": { + "name": "core.regex", + "version": "2.0", + "class_name": "RegexDistribution", + "unique": false, + "parameters": { + "regex_data": { + "regex": "[A-G][0-9 ]{1,3}(|[B-C][0-9]{2}(| [B-C][0-9]{2})|[A-Z][0-9]{2})", + "counts": [ + 198, + 198, + [ + [ + 176 + ], + [ + 17, + 17, + [ + [ + 11 + ], + [ + 6, + 6, + 6, + 6 + ] + ], + 0 + ], + [ + 5, + 5, + 5 + ] + ], + 0 + ] + } + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "RegexFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "2.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Embarked", + "type": "categorical", + "dtype": "Categorical", + "prop_missing": 0.002244668911335578, + "distribution": { + "name": "core.multinoulli", + "version": "1.0", + "class_name": "MultinoulliDistribution", + "unique": false, + "parameters": { + "labels": [ + "C", + "Q", + "S" + ], + "probs": [ + 0.1889763779527559, + 
0.08661417322834646, + 0.7244094488188977 + ] + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "MultinoulliFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Birthday", + "type": "date", + "dtype": "Date", + "prop_missing": 0.08754208754208755, + "distribution": { + "name": "core.uniform", + "version": "1.0", + "class_name": "DateUniformDistribution", + "unique": false, + "parameters": { + "lower": "1903-07-28", + "upper": "1940-05-27" + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "DateUniformFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Board time", + "type": "time", + "dtype": "Time", + "prop_missing": 0.08866442199775533, + "distribution": { + "name": "core.uniform", + "version": "1.0", + "class_name": "TimeUniformDistribution", + "unique": false, + "parameters": { + "lower": "10:39:40", + "upper": "18:39:28", + "precision": "seconds" + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "TimeUniformFitter", + "privacy": { + "name": "none", + "parameters": {} + }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "Married since", + "type": "datetime", + "dtype": "Datetime(time_unit='us', time_zone=None)", + "prop_missing": 0.10325476992143659, + "distribution": { + "name": "core.uniform", + "version": "1.0", + "class_name": "DateTimeUniformDistribution", + "unique": false, + "parameters": { + "lower": "2022-07-15T12:21:15", + "upper": "2022-08-15T10:32:05", + "precision": "seconds" + } + }, + "creation_method": { + "created_by": "metasyn", + "fitter": { + "name": "DateTimeUniformFitter", + "privacy": { + "name": "none", + "parameters": {} 
+ }, + "version": "1.0", + "plugin": "builtin", + "plugin_version": "1.1.1.dev42+g2631ec541" + } + } + }, + { + "name": "all_NA", + "type": "string", + "dtype": "String", + "prop_missing": 1.0, + "distribution": { + "name": "core.na", + "version": "1.0", + "class_name": "NADistribution", + "unique": false, + "parameters": {} + }, + "creation_method": { + "created_by": "metasyn" + } + } + ] +} \ No newline at end of file diff --git a/tests/test_cli.py b/tests/test_cli.py index 65396973..9e4826fc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -11,7 +11,7 @@ from metasyn import MetaFrame from metasyn.__main__ import main from metasyn.file import _AVAILABLE_FILE_INTERFACES -from metasyn.validation import validate_gmf_dict +from metasyn.gmf import validate_gmf_dict TMP_DIR_PATH = None diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 4bb134dd..8dc391ad 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -7,8 +7,15 @@ import pytest from pytest import mark -from metasyn.demo.dataset import _AVAILABLE_DATASETS, _get_demo_class, demo_dataframe, demo_file +from metasyn.demo.dataset import ( + _AVAILABLE_DATASETS, + BaseMultiDataset, + _get_demo_class, + demo_file, + demo_data, +) from metasyn.metaframe import MetaFrame +from metasyn.multiframe import MultiFrame from metasyn.privacy import BasicPrivacy from metasyn.registry import DistributionRegistry from metasyn.var import MetaVar @@ -130,14 +137,16 @@ def test_distributions(tmp_path): dataset.save_json(tmp_fp) @mark.parametrize( - "dataset_name", list(_AVAILABLE_DATASETS) + "dataset_name", list(_AVAILABLE_DATASETS), ) def test_demo_datasets(tmp_path, dataset_name): """Test all built-in demo datasets and see if they can be synthesized.""" demo_fp = demo_file(dataset_name) - demo_df = demo_dataframe(dataset_name) + demo_df = demo_data(dataset_name) demo_class = _get_demo_class(dataset_name) - + print(demo_class) + if isinstance(demo_class, BaseMultiDataset): + return assert 
demo_fp.is_file() assert isinstance(demo_df, pl.DataFrame) @@ -157,6 +166,18 @@ def test_demo_datasets(tmp_path, dataset_name): for col, dtype in demo_class.schema.items(): assert dtype == df_syn[col].dtype +def test_demo_multi(tmp_path): + dfs = demo_data("shop_multi") + demo_class = _get_demo_class("shop_multi") + assert isinstance(demo_file("shop_multi"), dict) + assert isinstance(dfs, dict) + demo_class.create(tmp_path) + for fp in demo_class.file_location.values(): + assert fp.is_file() + mf = MultiFrame.fit_dataframes(dfs, + relations = ["customers[id] SUBSET OF purchases[customer_id]"]) + dfs = mf.synthesize() + assert isinstance(dfs, dict) def test_demo_non_exist(): """Check that trying to get a demo dataset that doesn't exist raises an error.""" diff --git a/tests/test_file.py b/tests/test_file.py index 990b84ec..6a9b4afa 100644 --- a/tests/test_file.py +++ b/tests/test_file.py @@ -5,7 +5,7 @@ from pytest import mark import metasyn as ms -from metasyn.demo.dataset import _AVAILABLE_DATASETS, demo_dataframe, demo_file +from metasyn.demo.dataset import _AVAILABLE_DATASETS, demo_data, demo_file from metasyn.file import ( _AVAILABLE_FILE_INTERFACES, BaseFileInterface, @@ -127,17 +127,27 @@ def test_sav_interface(filename, tmpdir): assert new_df.columns == df.columns -def test_prs_chunking(): +@mark.parametrize( + "max_rows,chunk_size,length", + [ + (None, None, 810), + (10, None, 10), + (20, 2, 20), + (19, 2, 19), + (800, 33, 800), + (800, 650, 800), + (810, 405, 810), + (810, 23, 810), + (1230, 10, 810), + ]) +def test_prs_chunking(max_rows, chunk_size, length): sav_fp = Path("tests", "data", "GlastonburyFestival.sav") - df, _ = ms.read_sav(sav_fp) - assert len(df) == 810 - df, _ = ms.read_sav(sav_fp, max_rows=10) - assert len(df) == 10 - df, _ = ms.read_sav(sav_fp, max_rows=20, chunk_size=2) - assert len(df) == 20 + df, _ = ms.read_sav(sav_fp, max_rows=max_rows, chunk_size=chunk_size) + assert len(df) == length + @mark.parametrize("dataset_name", - 
_AVAILABLE_DATASETS) + [x for x in _AVAILABLE_DATASETS if x != "shop_multi"]) def test_csv_interface(dataset_name, tmpdir): filename = demo_file(dataset_name) direct_df, _ = CsvFileInterface.read_file(filename) @@ -194,7 +204,7 @@ def test_file_interface_errors(): @mark.parametrize("interface_class", [x for x in _AVAILABLE_FILE_INTERFACES.values() if not x.__name__.startswith("Bad")]) def test_default_file_interfaces(interface_class, tmpdir): - df = demo_dataframe("test") + df = demo_data("test") suffix = interface_class.extensions[0] fp = Path(tmpdir/f"test_file{suffix}") interface_class.default_interface(fp).write_file(df, fp) @@ -210,7 +220,7 @@ def test_default_file_interfaces(interface_class, tmpdir): assert df_new.shape == df.shape def test_stata(tmpdir): - df = demo_dataframe("test") + df = demo_data("test") file_out = tmpdir / "test.dta" ms.write_dta(df, file_out) # StataFileInterface.default_interface(file_out).write_file(df, file_out) diff --git a/tests/test_multiframe.py b/tests/test_multiframe.py new file mode 100644 index 00000000..5ba74b05 --- /dev/null +++ b/tests/test_multiframe.py @@ -0,0 +1,148 @@ +import numpy as np +import polars as pl +import pytest +from pytest import mark + +from metasyn.metaframe import MetaFrame +from metasyn.multiframe import ColumnRelation, MultiFrame, RelationType + + +@pytest.fixture() +def mock_data(): + id_a = np.unique(np.random.randint(0, 1000, size=100))[:50] + id_a_shuffled = np.copy(id_a) + np.random.shuffle(id_a_shuffled) + id_a_chosen = np.random.choice(id_a, replace=True) + id_b = np.unique(np.random.randint(2000, 3000, size=100))[:50] + + return pl.DataFrame({"id": id_a, "id_shuffled": id_a_shuffled, "id_chosen": id_a_chosen, + "unrelated": id_b}) + +@pytest.fixture() +def mock_multi_frame(mock_data): + mf1 = MetaFrame.fit_dataframe(mock_data) + mf2 = MetaFrame.fit_dataframe(mock_data) + mfs = {"df_a": mf1, "b": mf2} + multi_frame = MultiFrame(mfs, ["df_a[id] EQUALS b[id]"]) + return multi_frame + +def 
test_print_metaframe(mock_multi_frame): + mf_str = str(mock_multi_frame) + assert "id" in mf_str + assert "df_a" in mf_str + assert "4" in mf_str # Number of columns + +def test_multi_getitem(mock_multi_frame): + assert isinstance(mock_multi_frame["df_a"], MetaFrame) + +@mark.parametrize("obj,symbol,obj_str", [ + (RelationType.Subset, "SUBSET OF", "SUBSET OF"), + (RelationType.Equal, "EQUALS", "EQUALS"), + (RelationType.EqualOrdered, "EQUAL ORDERED", "EQUAL ORDERED"), +]) +def test_rel_type_parse(obj, symbol, obj_str): + assert obj == RelationType.parse(symbol) + assert obj == RelationType(obj_str) + +def test_rel_type_error(): + with pytest.raises(ValueError): + RelationType.parse("a") + with pytest.raises(ValueError): + RelationType.parse("??") + +@mark.parametrize("rel_str,expected", [ + ("a[b] SUBSET OF c[d]", ("a", "b", "c", "d", RelationType.Subset)), + (r"a[\[\]] EQUALS c[()]", ("a", "[]", "c", "()", RelationType.Equal)), + (" a[ b ] EQUAL ORDERED c[d]", (" a", " b ", " c", "d", RelationType.EqualOrdered)), + ("a[\nb] INFER FROM c[d]", ("a", "\nb", "c", "d", RelationType.Infer)), + ("a[b]<