Skip to content

Commit 6fb1796

Browse files
committed
feat: add component for checking all the data registry publications
1 parent 893deb6 commit 6fb1796

13 files changed

+256
-6
lines changed

component_check_relevant_all_registry.ipynb

+116-3
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
{
44
"cell_type": "markdown",
55
"source": [
6-
"## Check which publications from the Data Registry passes or not the Relevant criterion\n",
6+
"## Check the MVP status of the Data Registry publications\n",
77
"\n",
8-
"Use this notebook to check which publications in the Data Registry passes the Relevant criterion"
8+
"Use this notebook to check which publications in the Data Registry pass the MVP Relevant and Active criteria, for example, for updating the MEL1 tracker upon OCP Rapid Reflection meetings."
99
],
1010
"metadata": {
1111
"collapsed": false
@@ -15,7 +15,120 @@
1515
"cell_type": "code",
1616
"execution_count": null,
1717
"outputs": [],
18-
"source": [],
18+
"source": [
19+
"# @title Get all the publications from the registry { display-mode: \"form\" }\n",
20+
"\n",
21+
"publications = get_publications()"
22+
],
23+
"metadata": {
24+
"collapsed": false
25+
}
26+
},
27+
{
28+
"cell_type": "markdown",
29+
"source": [
30+
"### Check for non-frozen publications whose latest data has not been updated in the previous four calendar quarters\n",
31+
"\n",
32+
"From the list, also check the \"last_retrieved\" and \"update_frequency\" columns. If the data is not being retrieved, review the publication log in the Data Registry to see whether there is a problem with either a job or the source data itself."
33+
],
34+
"metadata": {
35+
"collapsed": false
36+
}
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": null,
41+
"outputs": [],
42+
"source": [
43+
"non_frozen_publications = list(filter(lambda item: not item[\"frozen\"] and item[\"date_to\"], publications))\n",
44+
"past_year = datetime.now() - relativedelta(years=1)\n",
45+
"lapsed_publications = list(\n",
46+
" filter(lambda item: datetime.strptime(item[\"date_to\"], \"%Y-%m-%d\") < past_year, non_frozen_publications)\n",
47+
")\n",
48+
"lapsed_publications_table = pd.DataFrame(lapsed_publications)\n",
49+
"lapsed_publications_table"
50+
],
51+
"metadata": {
52+
"collapsed": false
53+
}
54+
},
55+
{
56+
"cell_type": "markdown",
57+
"source": [
58+
"### Check non-relevant publications\n",
59+
"Check which active publications pass or fail the \"Relevant\" criterion."
60+
],
61+
"metadata": {
62+
"collapsed": false
63+
}
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": null,
68+
"outputs": [],
69+
"source": [
70+
"results = []\n",
71+
"active_publications = [item for item in non_frozen_publications if item not in lapsed_publications]\n",
72+
"for publication in active_publications:\n",
73+
" year = publication[\"date_to\"][:4]\n",
74+
" if int(year) > datetime.now().year:\n",
75+
" year = datetime.now().year\n",
76+
" file_name = download_file(publication, year)\n",
77+
" field_table = cardinal_calculate_coverage(file_name)\n",
78+
" fields_list = field_table.iloc[:, 0].tolist()\n",
79+
" relevant, relevant_table = is_relevant(fields_list)\n",
80+
" relevant_table[\"publisher\"] = publication[\"label\"]\n",
81+
" relevant_table[\"relevant\"] = relevant\n",
82+
" results.append(relevant_table)"
83+
],
84+
"metadata": {
85+
"collapsed": false
86+
}
87+
},
88+
{
89+
"cell_type": "markdown",
90+
"source": [
91+
"Filter the non-relevant ones"
92+
],
93+
"metadata": {
94+
"collapsed": false
95+
}
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": null,
100+
"outputs": [],
101+
"source": [
102+
"result = pd.concat(results)\n",
103+
"not_relevant_publishers = result[~result[\"relevant\"]]\n",
104+
"non_relevant_rules = (\n",
105+
" not_relevant_publishers[not_relevant_publishers[\"possible_to_calculate\"] == \"No\"]\n",
106+
" .groupby(\"publisher\")\n",
107+
" .apply(lambda x: \", \".join(x[\"rule\"].astype(str) + \": \" + x[\"missing_fields\"].astype(str)))\n",
108+
" .reset_index()\n",
109+
" .rename(columns={0: \"failed rules\"})\n",
110+
")"
111+
],
112+
"metadata": {
113+
"collapsed": false
114+
}
115+
},
116+
{
117+
"cell_type": "markdown",
118+
"source": [
119+
"Check the results"
120+
],
121+
"metadata": {
122+
"collapsed": false
123+
}
124+
},
125+
{
126+
"cell_type": "code",
127+
"execution_count": null,
128+
"outputs": [],
129+
"source": [
130+
"non_relevant_rules"
131+
],
19132
"metadata": {
20133
"collapsed": false
21134
}

component_environment.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@
5656
"import tempfile\n",
5757
"from collections import Counter\n",
5858
"\n",
59+
"from datetime import datetime\n",
60+
"from dateutil.relativedelta import relativedelta\n",
5961
"import numpy as np\n",
6062
"import pandas as pd\n",
6163
"from google.colab.data_table import DataTable\n",

template_basic_criteria_checks.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_data_quality_feedback.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_meta_analysis.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_publisher_analysis.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_relevant_checks_fieldlist.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_relevant_checks_registry.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_relevant_checks_registry_all.ipynb

+118-3
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",
@@ -870,9 +872,9 @@
870872
"collapsed": false
871873
},
872874
"source": [
873-
"## Check which publications from the Data Registry passes or not the Relevant criterion\n",
875+
"## Check the MVP status of the Data Registry publications\n",
874876
"\n",
875-
"Use this notebook to check which publications in the Data Registry passes the Relevant criterion"
877+
"Use this notebook to check which publications in the Data Registry pass the MVP Relevant and Active criteria, for example, for updating the MEL1 tracker upon OCP Rapid Reflection meetings."
876878
]
877879
},
878880
{
@@ -882,7 +884,120 @@
882884
"collapsed": false
883885
},
884886
"outputs": [],
885-
"source": []
887+
"source": [
888+
"# @title Get all the publications from the registry { display-mode: \"form\" }\n",
889+
"\n",
890+
"publications = get_publications()"
891+
]
892+
},
893+
{
894+
"cell_type": "markdown",
895+
"metadata": {
896+
"collapsed": false
897+
},
898+
"source": [
899+
"### Check for non-frozen publications whose latest data has not been updated in the previous four calendar quarters\n",
900+
"\n",
901+
"From the list, also check the \"last_retrieved\" and \"update_frequency\" columns. If the data is not being retrieved, review the publication log in the Data Registry to see whether there is a problem with either a job or the source data itself."
902+
]
903+
},
904+
{
905+
"cell_type": "code",
906+
"execution_count": null,
907+
"metadata": {
908+
"collapsed": false
909+
},
910+
"outputs": [],
911+
"source": [
912+
"non_frozen_publications = list(filter(lambda item: not item[\"frozen\"] and item[\"date_to\"], publications))\n",
913+
"past_year = datetime.now() - relativedelta(years=1)\n",
914+
"lapsed_publications = list(\n",
915+
" filter(lambda item: datetime.strptime(item[\"date_to\"], \"%Y-%m-%d\") < past_year, non_frozen_publications)\n",
916+
")\n",
917+
"lapsed_publications_table = pd.DataFrame(lapsed_publications)\n",
918+
"lapsed_publications_table"
919+
]
920+
},
921+
{
922+
"cell_type": "markdown",
923+
"metadata": {
924+
"collapsed": false
925+
},
926+
"source": [
927+
"### Check non-relevant publications\n",
928+
"Check which active publications pass or fail the \"Relevant\" criterion."
929+
]
930+
},
931+
{
932+
"cell_type": "code",
933+
"execution_count": null,
934+
"metadata": {
935+
"collapsed": false
936+
},
937+
"outputs": [],
938+
"source": [
939+
"results = []\n",
940+
"active_publications = [item for item in non_frozen_publications if item not in lapsed_publications]\n",
941+
"for publication in active_publications:\n",
942+
" year = publication[\"date_to\"][:4]\n",
943+
" if int(year) > datetime.now().year:\n",
944+
" year = datetime.now().year\n",
945+
" file_name = download_file(publication, year)\n",
946+
" field_table = cardinal_calculate_coverage(file_name)\n",
947+
" fields_list = field_table.iloc[:, 0].tolist()\n",
948+
" relevant, relevant_table = is_relevant(fields_list)\n",
949+
" relevant_table[\"publisher\"] = publication[\"label\"]\n",
950+
" relevant_table[\"relevant\"] = relevant\n",
951+
" results.append(relevant_table)"
952+
]
953+
},
954+
{
955+
"cell_type": "markdown",
956+
"metadata": {
957+
"collapsed": false
958+
},
959+
"source": [
960+
"Filter the non-relevant ones"
961+
]
962+
},
963+
{
964+
"cell_type": "code",
965+
"execution_count": null,
966+
"metadata": {
967+
"collapsed": false
968+
},
969+
"outputs": [],
970+
"source": [
971+
"result = pd.concat(results)\n",
972+
"not_relevant_publishers = result[~result[\"relevant\"]]\n",
973+
"non_relevant_rules = (\n",
974+
" not_relevant_publishers[not_relevant_publishers[\"possible_to_calculate\"] == \"No\"]\n",
975+
" .groupby(\"publisher\")\n",
976+
" .apply(lambda x: \", \".join(x[\"rule\"].astype(str) + \": \" + x[\"missing_fields\"].astype(str)))\n",
977+
" .reset_index()\n",
978+
" .rename(columns={0: \"failed rules\"})\n",
979+
")"
980+
]
981+
},
982+
{
983+
"cell_type": "markdown",
984+
"metadata": {
985+
"collapsed": false
986+
},
987+
"source": [
988+
"Check the results"
989+
]
990+
},
991+
{
992+
"cell_type": "code",
993+
"execution_count": null,
994+
"metadata": {
995+
"collapsed": false
996+
},
997+
"outputs": [],
998+
"source": [
999+
"non_relevant_rules"
1000+
]
8861001
}
8871002
],
8881003
"metadata": {

template_structure_and_format_feedback.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_usability_checks.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_usability_checks_fieldlist.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

template_usability_checks_registry.ipynb

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"import tempfile\n",
5050
"from collections import Counter\n",
5151
"\n",
52+
"from datetime import datetime\n",
53+
"from dateutil.relativedelta import relativedelta\n",
5254
"import numpy as np\n",
5355
"import pandas as pd\n",
5456
"from google.colab.data_table import DataTable\n",

0 commit comments

Comments
 (0)