|
49 | 49 | "import tempfile\n",
|
50 | 50 | "from collections import Counter\n",
|
51 | 51 | "\n",
|
| 52 | + "from datetime import datetime\n", |
| 53 | + "from dateutil.relativedelta import relativedelta\n", |
52 | 54 | "import numpy as np\n",
|
53 | 55 | "import pandas as pd\n",
|
54 | 56 | "from google.colab.data_table import DataTable\n",
|
|
870 | 872 | "collapsed": false
|
871 | 873 | },
|
872 | 874 | "source": [
|
873 |
| - "## Check which publications from the Data Registry passes or not the Relevant criterion\n", |
| 875 | + "## Check the MVP status of the Data Registry publications\n", |
874 | 876 | "\n",
|
875 |
| - "Use this notebook to check which publications in the Data Registry passes the Relevant criterion" |
| 877 | + "Use this notebook to check which publications in the Data Registry pass the MVP Relevant and Active criteria, for example, for updating the MEL1 tracker upon OCP Rapid Reflection meetings." |
876 | 878 | ]
|
877 | 879 | },
|
878 | 880 | {
|
|
882 | 884 | "collapsed": false
|
883 | 885 | },
|
884 | 886 | "outputs": [],
|
885 |
| - "source": [] |
| 887 | + "source": [ |
| 888 | + "# @title Get all the publications from the registry { display-mode: \"form\" }\n", |
| 889 | + "\n", |
| 890 | + "publications = get_publications()" |
| 891 | + ] |
| 892 | + }, |
| 893 | + { |
| 894 | + "cell_type": "markdown", |
| 895 | + "metadata": { |
| 896 | + "collapsed": false |
| 897 | + }, |
| 898 | + "source": [ |
| 899 | + "### Check for non-frozen publications whose latest data has not been updated in the previous four calendar quarters\n", |
| 900 | + "\n", |
| 901 | + "In the resulting table, also check the \"last_retrieved\" and \"update_frequency\" columns. If the data is not being retrieved, review the publication log in the Data Registry to see whether there is a problem with either a job or the source data itself." |
| 902 | + ] |
| 903 | + }, |
| 904 | + { |
| 905 | + "cell_type": "code", |
| 906 | + "execution_count": null, |
| 907 | + "metadata": { |
| 908 | + "collapsed": false |
| 909 | + }, |
| 910 | + "outputs": [], |
| 911 | + "source": [ |
| 912 | + "non_frozen_publications = list(filter(lambda item: not item[\"frozen\"] and item[\"date_to\"], publications))\n", |
| 913 | + "past_year = datetime.now() - relativedelta(years=1)\n", |
| 914 | + "lapsed_publications = list(\n", |
| 915 | + " filter(lambda item: datetime.strptime(item[\"date_to\"], \"%Y-%m-%d\") < past_year, non_frozen_publications)\n", |
| 916 | + ")\n", |
| 917 | + "lapsed_publications_table = pd.DataFrame(lapsed_publications)\n", |
| 918 | + "lapsed_publications_table" |
| 919 | + ] |
| 920 | + }, |
| 921 | + { |
| 922 | + "cell_type": "markdown", |
| 923 | + "metadata": { |
| 924 | + "collapsed": false |
| 925 | + }, |
| 926 | + "source": [ |
| 927 | + "### Check non-relevant publications\n", |
| 928 | + "Check which active publications pass and which do not pass the \"Relevant\" criterion." |
| 929 | + ] |
| 930 | + }, |
| 931 | + { |
| 932 | + "cell_type": "code", |
| 933 | + "execution_count": null, |
| 934 | + "metadata": { |
| 935 | + "collapsed": false |
| 936 | + }, |
| 937 | + "outputs": [], |
| 938 | + "source": [ |
| 939 | + "results = []\n", |
| 940 | + "active_publications = [item for item in non_frozen_publications if item not in lapsed_publications]\n", |
| 941 | + "for publication in active_publications:\n", |
| 942 | + " year = publication[\"date_to\"][:4]\n", |
| 943 | + " if int(year) > datetime.now().year:\n", |
| 944 | + " year = datetime.now().year\n", |
| 945 | + " file_name = download_file(publication, year)\n", |
| 946 | + " field_table = cardinal_calculate_coverage(file_name)\n", |
| 947 | + " fields_list = field_table.iloc[:, 0].tolist()\n", |
| 948 | + " relevant, relevant_table = is_relevant(fields_list)\n", |
| 949 | + " relevant_table[\"publisher\"] = publication[\"label\"]\n", |
| 950 | + " relevant_table[\"relevant\"] = relevant\n", |
| 951 | + " results.append(relevant_table)" |
| 952 | + ] |
| 953 | + }, |
| 954 | + { |
| 955 | + "cell_type": "markdown", |
| 956 | + "metadata": { |
| 957 | + "collapsed": false |
| 958 | + }, |
| 959 | + "source": [ |
| 960 | + "Filter the non-relevant ones" |
| 961 | + ] |
| 962 | + }, |
| 963 | + { |
| 964 | + "cell_type": "code", |
| 965 | + "execution_count": null, |
| 966 | + "metadata": { |
| 967 | + "collapsed": false |
| 968 | + }, |
| 969 | + "outputs": [], |
| 970 | + "source": [ |
| 971 | + "result = pd.concat(results)\n", |
| 972 | + "not_relevant_publishers = result[~result[\"relevant\"]]\n", |
| 973 | + "non_relevant_rules = (\n", |
| 974 | + " not_relevant_publishers[not_relevant_publishers[\"possible_to_calculate\"] == \"No\"]\n", |
| 975 | + " .groupby(\"publisher\")\n", |
| 976 | + " .apply(lambda x: \", \".join(x[\"rule\"].astype(str) + \": \" + x[\"missing_fields\"].astype(str)))\n", |
| 977 | + " .reset_index()\n", |
| 978 | + " .rename(columns={0: \"failed rules\"})\n", |
| 979 | + ")" |
| 980 | + ] |
| 981 | + }, |
| 982 | + { |
| 983 | + "cell_type": "markdown", |
| 984 | + "metadata": { |
| 985 | + "collapsed": false |
| 986 | + }, |
| 987 | + "source": [ |
| 988 | + "Check the results" |
| 989 | + ] |
| 990 | + }, |
| 991 | + { |
| 992 | + "cell_type": "code", |
| 993 | + "execution_count": null, |
| 994 | + "metadata": { |
| 995 | + "collapsed": false |
| 996 | + }, |
| 997 | + "outputs": [], |
| 998 | + "source": [ |
| 999 | + "non_relevant_rules" |
| 1000 | + ] |
886 | 1001 | }
|
887 | 1002 | ],
|
888 | 1003 | "metadata": {
|
|
0 commit comments