feat: add component for checking all the data registry publications

yolile · yolile · commit 6fb17963c135 · 2024-02-28T19:09:14.000-03:00
diff --git a/component_check_relevant_all_registry.ipynb b/component_check_relevant_all_registry.ipynb
@@ -3,9 +3,9 @@
     {
       "cell_type": "markdown",
       "source": [
-        "## Check which publications from the Data Registry passes or not the Relevant criterion\n",
+        "## Check the MVP status of the Data Registry publications\n",
         "\n",
-        "Use this notebook to check which publications in the Data Registry passes the Relevant criterion"
+        "Use this notebook to check which publications in the Data Registry pass the MVP Relevant and Active criteria, for example, for updating the MEL1 tracker upon OCP Rapid Reflection meetings."
       ],
       "metadata": {
         "collapsed": false
@@ -15,7 +15,120 @@
       "cell_type": "code",
       "execution_count": null,
       "outputs": [],
-      "source": [],
+      "source": [
+        "# @title Get all the publications from the registry { display-mode: \"form\" }\n",
+        "\n",
+        "publications = get_publications()"
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Check for non-frozen publications whose latest data has not been updated in the previous four calendar quarters\n",
+        "\n",
+        "From the list, also check the \"last_retrieved\" and \"update_frequency\" columns. If the data is not being retrieved, review the publication log in the Data Registry to see whether there is a problem with either a job or the source data itself."
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "non_frozen_publications = list(filter(lambda item: not item[\"frozen\"] and item[\"date_to\"], publications))\n",
+        "past_year = datetime.now() - relativedelta(years=1)\n",
+        "lapsed_publications = list(\n",
+        "    filter(lambda item: datetime.strptime(item[\"date_to\"], \"%Y-%m-%d\") < past_year, non_frozen_publications)\n",
+        ")\n",
+        "lapsed_publications_table = pd.DataFrame(lapsed_publications)\n",
+        "lapsed_publications_table"
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### Check non-relevant publications\n",
+        "Check which active publications pass or do not pass the \"Relevant\" criterion."
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "results = []\n",
+        "active_publications = [item for item in non_frozen_publications if item not in lapsed_publications]\n",
+        "for publication in active_publications:\n",
+        "    year = publication[\"date_to\"][:4]\n",
+        "    if int(year) > datetime.now().year:\n",
+        "        year = datetime.now().year\n",
+        "    file_name = download_file(publication, year)\n",
+        "    field_table = cardinal_calculate_coverage(file_name)\n",
+        "    fields_list = field_table.iloc[:, 0].tolist()\n",
+        "    relevant, relevant_table = is_relevant(fields_list)\n",
+        "    relevant_table[\"publisher\"] = publication[\"label\"]\n",
+        "    relevant_table[\"relevant\"] = relevant\n",
+        "    results.append(relevant_table)"
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
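+        "Filter the non-relevant publications and list, for each publisher, the rules that cannot be calculated along with their missing fields"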
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "result = pd.concat(results)\n",
+        "not_relevant_publishers = result[~result[\"relevant\"]]\n",
+        "non_relevant_rules = (\n",
+        "    not_relevant_publishers[not_relevant_publishers[\"possible_to_calculate\"] == \"No\"]\n",
+        "    .groupby(\"publisher\")\n",
+        "    .apply(lambda x: \", \".join(x[\"rule\"].astype(str) + \": \" + x[\"missing_fields\"].astype(str)))\n",
+        "    .reset_index()\n",
+        "    .rename(columns={0: \"failed rules\"})\n",
+        ")"
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Check the results"
+      ],
+      "metadata": {
+        "collapsed": false
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "non_relevant_rules"
+      ],
       "metadata": {
         "collapsed": false
       }
diff --git a/component_environment.ipynb b/component_environment.ipynb
@@ -56,6 +56,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_basic_criteria_checks.ipynb b/template_basic_criteria_checks.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_data_quality_feedback.ipynb b/template_data_quality_feedback.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_meta_analysis.ipynb b/template_meta_analysis.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_publisher_analysis.ipynb b/template_publisher_analysis.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_relevant_checks_fieldlist.ipynb b/template_relevant_checks_fieldlist.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_relevant_checks_registry.ipynb b/template_relevant_checks_registry.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_relevant_checks_registry_all.ipynb b/template_relevant_checks_registry_all.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
@@ -870,9 +872,9 @@
         "collapsed": false
       },
       "source": [
-        "## Check which publications from the Data Registry passes or not the Relevant criterion\n",
+        "## Check the MVP status of the Data Registry publications\n",
         "\n",
-        "Use this notebook to check which publications in the Data Registry passes the Relevant criterion"
+        "Use this notebook to check which publications in the Data Registry pass the MVP Relevant and Active criteria, for example, for updating the MEL1 tracker upon OCP Rapid Reflection meetings."
       ]
     },
     {
@@ -882,7 +884,120 @@
         "collapsed": false
       },
       "outputs": [],
-      "source": []
+      "source": [
+        "# @title Get all the publications from the registry { display-mode: \"form\" }\n",
+        "\n",
+        "publications = get_publications()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "collapsed": false
+      },
+      "source": [
+        "### Check for non-frozen publications whose latest data has not been updated in the previous four calendar quarters\n",
+        "\n",
+        "From the list, also check the \"last_retrieved\" and \"update_frequency\" columns. If the data is not being retrieved, review the publication log in the Data Registry to see whether there is a problem with either a job or the source data itself."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "non_frozen_publications = list(filter(lambda item: not item[\"frozen\"] and item[\"date_to\"], publications))\n",
+        "past_year = datetime.now() - relativedelta(years=1)\n",
+        "lapsed_publications = list(\n",
+        "    filter(lambda item: datetime.strptime(item[\"date_to\"], \"%Y-%m-%d\") < past_year, non_frozen_publications)\n",
+        ")\n",
+        "lapsed_publications_table = pd.DataFrame(lapsed_publications)\n",
+        "lapsed_publications_table"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "collapsed": false
+      },
+      "source": [
+        "### Check non-relevant publications\n",
+        "Check which active publications pass or do not pass the \"Relevant\" criterion."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "results = []\n",
+        "active_publications = [item for item in non_frozen_publications if item not in lapsed_publications]\n",
+        "for publication in active_publications:\n",
+        "    year = publication[\"date_to\"][:4]\n",
+        "    if int(year) > datetime.now().year:\n",
+        "        year = datetime.now().year\n",
+        "    file_name = download_file(publication, year)\n",
+        "    field_table = cardinal_calculate_coverage(file_name)\n",
+        "    fields_list = field_table.iloc[:, 0].tolist()\n",
+        "    relevant, relevant_table = is_relevant(fields_list)\n",
+        "    relevant_table[\"publisher\"] = publication[\"label\"]\n",
+        "    relevant_table[\"relevant\"] = relevant\n",
+        "    results.append(relevant_table)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "collapsed": false
+      },
+      "source": [
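+        "Filter the non-relevant publications and list, for each publisher, the rules that cannot be calculated along with their missing fields"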
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "result = pd.concat(results)\n",
+        "not_relevant_publishers = result[~result[\"relevant\"]]\n",
+        "non_relevant_rules = (\n",
+        "    not_relevant_publishers[not_relevant_publishers[\"possible_to_calculate\"] == \"No\"]\n",
+        "    .groupby(\"publisher\")\n",
+        "    .apply(lambda x: \", \".join(x[\"rule\"].astype(str) + \": \" + x[\"missing_fields\"].astype(str)))\n",
+        "    .reset_index()\n",
+        "    .rename(columns={0: \"failed rules\"})\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "collapsed": false
+      },
+      "source": [
+        "Check the results"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "non_relevant_rules"
+      ]
     }
   ],
   "metadata": {
diff --git a/template_structure_and_format_feedback.ipynb b/template_structure_and_format_feedback.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_usability_checks.ipynb b/template_usability_checks.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_usability_checks_fieldlist.ipynb b/template_usability_checks_fieldlist.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",
diff --git a/template_usability_checks_registry.ipynb b/template_usability_checks_registry.ipynb
@@ -49,6 +49,8 @@
         "import tempfile\n",
         "from collections import Counter\n",
         "\n",
+        "from datetime import datetime\n",
+        "from dateutil.relativedelta import relativedelta\n",
         "import numpy as np\n",
         "import pandas as pd\n",
         "from google.colab.data_table import DataTable\n",