diff --git a/README.md b/README.md index 424397b..e6ec984 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Notebook | OpenĀ inĀ Colab | Description [Relevant checks for all the Data Registry publications](https://github.com/open-contracting/notebooks-ocds/blob/main/template_relevant_checks_registry_all.ipynb) | [![Open Iinn Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/open-contracting/notebooks-ocds/blob/main/template_relevant_checks_registry_all.ipynb) | Provide feedback on data relevance downloading all the publications from the [Data Registry](https://data.open-contracting.org/). [Red flags checks using the Data Registry](https://github.com/open-contracting/notebooks-ocds/blob/main/template_red_flags_checks_registry.ipynb) | [![Open Iinn Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/open-contracting/notebooks-ocds/blob/main/template_red_flags_checks_registry.ipynb) | Provide feedback on coverage for red flags using data from the [Data Registry](https://data.open-contracting.org/). [Red flags checks using a field list](https://github.com/open-contracting/notebooks-ocds/blob/main/template_red_flags_checks_fieldlist.ipynb) | [![Open Iinn Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/open-contracting/notebooks-ocds/blob/main/template_red_flags_checks_fieldlist.ipynb) | Provide feedback on red flags for prospective OCDS publishers, using a field list, like from a field-level mapping. - +[Field list for all the Data Registry publications](https://github.com/open-contracting/notebooks-ocds/blob/main/template_field_list_registry_all.ipynb) | [![Open Iinn Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/open-contracting/notebooks-ocds/blob/main/template_field_list_registry_all.ipynb) | Extract the fields published by all the the publications from the [Data Registry](https://data.open-contracting.org/). ## Contributing diff --git a/component_get_field_list_all_registry.ipynb b/component_get_field_list_all_registry.ipynb new file mode 100644 index 0000000..5d5b20a --- /dev/null +++ b/component_get_field_list_all_registry.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## Get the fields used by all OCDS publications in the Registry\n", + "\n", + "Use this notebook to get the list of the fields implemented by all the publishers in the Data Registry, for example, to check what publishers are publishing specific fields." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# @title Get all the publications from the registry { display-mode: \"form\" }\n", + "\n", + "publications = get_publications()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "### Download all the publications, using the latest file available" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "today = datetime.now(tz=timezone.utc)\n", + "results = []\n", + "for publication in publications:\n", + " if publication[\"date_to\"]:\n", + " year = publication[\"date_to\"][:4]\n", + " if int(year) > today.year:\n", + " year = today.year\n", + " else:\n", + " year = \"full\"\n", + " download_file(publication, year)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "### Extract the list of fields using cardinal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "final_dataset = pd.DataFrame()\n", + "\n", + "for file in os.listdir(\".\"):\n", + " if file.endswith(\".jsonl\"):\n", + " publisher = file.replace(\".jsonl\", \"\")\n", + " coverage = !./ocdscardinal coverage $file\n", + " data = (\n", + " pd.DataFrame.from_dict(json.loads(coverage[0]), orient=\"index\", columns=[\"count\"])\n", + " .reset_index()\n", + " .rename(columns={\"index\": \"path\"})\n", + " )\n", + " data[\"publisher\"] = publisher\n", + " final_dataset = pd.concat([final_dataset, data])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "final_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Export the results as CSV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "final_dataset.to_csv(\"ocds_fields_from_all_publishers.csv\", index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/manage.py b/manage.py index 2c214c3..89763b7 100755 --- a/manage.py +++ b/manage.py @@ -122,6 +122,12 @@ "component_setup_usability", "component_check_red_flags_external", ], + "template_field_list_registry_all": [ + "component_environment", + "component_setup_cardinal", + "component_setup_download_data_from_registry", + "component_get_field_list_all_registry", + ], } BASEDIR = Path(__file__).resolve().parent diff --git a/template_field_list_registry_all.ipynb b/template_field_list_registry_all.ipynb new file mode 100644 index 0000000..db1ee81 --- /dev/null +++ b/template_field_list_registry_all.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "sfg_SbQWBCmW" + }, + "source": [ + "## Setup\n", + "\n", + "*You must run the cells in this section each time you connect to a new runtime. For example, when you return to the notebook after an idle timeout, when the runtime crashes, or when you restart or factory reset the runtime.*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sowWi_Ve_Lm3" + }, + "source": [ + "Install requirements (*Note: ocdskingfishercolab installs google-colab, which expects specific versions of pandas and numpy*):\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X4nmyvOa_Ls7" + }, + "outputs": [], + "source": [ + "! pip install --upgrade pip > pip.log\n", + "! pip install --upgrade 'ocdskingfishercolab<0.4' ipywidgets psycopg2-binary >> pip.log" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tezDNj4uAOHq" + }, + "outputs": [], + "source": [ + "# @title Import packages and load extensions { display-mode: \"form\" }\n", + "\n", + "import gzip\n", + "import json\n", + "import os\n", + "import shutil\n", + "import tempfile\n", + "from collections import Counter\n", + "from datetime import datetime, timezone\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from dateutil.relativedelta import relativedelta\n", + "from google.colab.data_table import DataTable\n", + "from google.colab.files import download\n", + "from ipywidgets import widgets\n", + "from ocdskingfishercolab import (\n", + " authenticate_gspread,\n", + " calculate_coverage,\n", + " download_dataframe_as_csv,\n", + " format_thousands,\n", + " render_json,\n", + " save_dataframe_to_sheet,\n", + " save_dataframe_to_spreadsheet,\n", + " set_dark_mode,\n", + " set_light_mode,\n", + " set_spreadsheet_name,\n", + ")\n", + "\n", + "# Load https://pypi.org/project/ipython-sql/\n", + "%load_ext sql\n", + "# Load https://colab.research.google.com/notebooks/data_table.ipynb\n", + "%load_ext google.colab.data_table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "--8vgOiP_58f" + }, + "outputs": [], + "source": [ + "# @title Configure the notebook environment { display-mode: \"form\" }\n", + "\n", + "# Increase max columns so that Pandas DataFrames with many columns are rendered as data tables.\n", + "DataTable.max_columns = 50\n", + "# Remove the index from data tables for easier copy-pasting to Google Docs.\n", + "DataTable.include_index = False\n", + "\n", + "# Return Pandas DataFrames instead of regular result sets.\n", + "%config SqlMagic.autopandas = True\n", + "# Don't print number of rows affected.\n", + "%config SqlMagic.feedback = False\n", + "\n", + "# If you set Tools > Settings > Site > Theme to dark, uncomment this line.\n", + "# set_dark_mode()\n", + "# If you are creating plots to copy-paste into reports, uncomment this line.\n", + "# set_light_mode()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "25lVf3PwsmmV" + }, + "source": [ + "## Setup Cardinal" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6fzwlrSLOF3w" + }, + "source": [ + "### Install Cardinal" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1H6SWBCTDq5G" + }, + "source": [ + "This notebook uses [Cardinal](https://cardinal.readthedocs.io/en/latest/), a Rust package to calculate red flags and the coverage of OCDS data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6syz0fkkEdgj" + }, + "outputs": [], + "source": [ + "# @title Install { display-mode: \"form\" }\n", + "\n", + "! curl -sSOL https://github.com/open-contracting/cardinal-rs/releases/download/0.0.5/ocdscardinal-0.0.5-linux-64-bit.zip\n", + "! unzip -oj ocdscardinal-0.0.5-linux-64-bit.zip ocdscardinal-0.0.5-linux-64-bit/ocdscardinal\n", + "\n", + "def cardinal_calculate_coverage(file_name):\n", + " coverage = !./ocdscardinal coverage $file_name\n", + " fields = (\n", + " pd.DataFrame.from_dict(json.loads(coverage[0]), orient=\"index\", columns=[\"count\"])\n", + " .reset_index()\n", + " .rename(columns={\"index\": \"path\"})\n", + " )\n", + " # Leaves only object members\n", + " fields_table = fields[fields.path.str.contains(\"[a-z]$\")].copy()\n", + " fields_table[\"path\"] = fields_table[\"path\"].str.replace(r\"[][]|^/\", \"\", regex=True)\n", + " return fields_table" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vpPltvi1tjEs" + }, + "source": [ + "## Setup download data from the Data Registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CLc-_oJntg93" + }, + "outputs": [], + "source": [ + "# @title Data registry functions{ display-mode: \"form\" }\n", + "import requests\n", + "\n", + "DATA_REGISTRY_BASE_URL = \"https://data.open-contracting.org/en/\"\n", + "PUBLICATIONS_URL = f\"{DATA_REGISTRY_BASE_URL}publications.json\"\n", + "\n", + "\n", + "def get_publications():\n", + " publications = requests.get(PUBLICATIONS_URL, timeout=10).json()\n", + " for publication in publications:\n", + " publication[\"label\"] = f\"{publication['country']} - {publication['title']}\"\n", + " return publications\n", + "\n", + "\n", + "def get_publication_select_box():\n", + " return widgets.Dropdown(\n", + " options=sorted([entry[\"label\"] for entry in get_publications()]),\n", + " description=\"Publication:\",\n", + " disabled=False,\n", + " )\n", + "\n", + "\n", + "def get_available_years(publication):\n", + " years = [\"full\"]\n", + " if publication[\"date_from\"] and publication[\"date_to\"]:\n", + " year_from = int(publication[\"date_from\"][:4])\n", + " year_to = int(publication[\"date_to\"][:4])\n", + " years.extend(list(range(year_from, year_to + 1)))\n", + " return years\n", + "\n", + "\n", + "def get_years_select_box(publication_select_box):\n", + " selected_publication = next(\n", + " filter(lambda entry: entry[\"label\"] == publication_select_box.value, get_publications())\n", + " )\n", + " return (\n", + " widgets.Dropdown(\n", + " options=get_available_years(selected_publication),\n", + " description=\"Year:\",\n", + " disabled=False,\n", + " ),\n", + " selected_publication,\n", + " )\n", + "\n", + "\n", + "def download_file(selected_publication, selected_year):\n", + " file_name = f\"{selected_publication['source_id']}-{selected_year}.jsonl\"\n", + " download_url = (\n", + " f'{DATA_REGISTRY_BASE_URL}publication/{selected_publication[\"id\"]}/download?name={selected_year}.jsonl.gz'\n", + " )\n", + " response = requests.get(download_url, timeout=10)\n", + " with tempfile.NamedTemporaryFile() as gz_file:\n", + " gz_file.write(response.content)\n", + " with gzip.open(gz_file.name) as i, Path(file_name).open(\"wb\") as o:\n", + " shutil.copyfileobj(i, o)\n", + " return file_name" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## Get the fields used by all OCDS publications in the Registry\n", + "\n", + "Use this notebook to get the list of the fields implemented by all the publishers in the Data Registry, for example, to check what publishers are publishing specific fields." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# @title Get all the publications from the registry { display-mode: \"form\" }\n", + "\n", + "publications = get_publications()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "### Download all the publications, using the latest file available" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "today = datetime.now(tz=timezone.utc)\n", + "results = []\n", + "for publication in publications:\n", + " if publication[\"date_to\"]:\n", + " year = publication[\"date_to\"][:4]\n", + " if int(year) > today.year:\n", + " year = today.year\n", + " else:\n", + " year = \"full\"\n", + " download_file(publication, year)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "### Extract the list of fields using cardinal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "final_dataset = pd.DataFrame()\n", + "\n", + "for file in os.listdir(\".\"):\n", + " if file.endswith(\".jsonl\"):\n", + " publisher = file.replace(\".jsonl\", \"\")\n", + " coverage = !./ocdscardinal coverage $file\n", + " data = (\n", + " pd.DataFrame.from_dict(json.loads(coverage[0]), orient=\"index\", columns=[\"count\"])\n", + " .reset_index()\n", + " .rename(columns={\"index\": \"path\"})\n", + " )\n", + " data[\"publisher\"] = publisher\n", + " final_dataset = pd.concat([final_dataset, data])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "final_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Export the results as CSV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "final_dataset.to_csv(\"ocds_fields_from_all_publishers.csv\", index=False)" + ] + } + ], + "metadata": { + "colab": { + "name": "template_field_list_registry_all", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}