From c22eedc1fe287389ba56c523fb7b8b72b89f8e1c Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Thu, 31 Oct 2024 18:02:29 -0300 Subject: [PATCH 1/9] update airflow providers --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index cb25d96..153d936 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,8 +14,8 @@ USER airflow COPY requirements-uninstall.txt . RUN pip uninstall -y -r requirements-uninstall.txt && \ pip install --no-cache-dir \ - apache-airflow-providers-microsoft-mssql==3.8.0 \ - apache-airflow-providers-common-sql==1.15.0 + apache-airflow-providers-microsoft-mssql==3.9.0 \ + apache-airflow-providers-common-sql==1.16.0 # Copy and install requirements.txt COPY tests-requirements.txt . From 617eb6837af6ed5f0757e7207a04eaa814d87692 Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Thu, 31 Oct 2024 18:02:54 -0300 Subject: [PATCH 2/9] create new jobs for inlabs automation --- Makefile | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d548329..410be5f 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,12 @@ .PHONY: run -run: setup-containers create-example-variable +run: \ +setup-containers \ +create-example-variable \ +create-path-tmp-variable \ +create-inlabs-db \ +create-inlabs-db-connection \ +create-inlabs-portal-connection \ +activate-inlabs-load-dag setup-containers: docker compose up -d --force-recreate --remove-orphans @@ -22,6 +29,78 @@ create-example-variable: }' > /dev/null; \ fi" +create-path-tmp-variable: + @echo "Creating 'path_tmp' Airflow variable" + @docker exec airflow-webserver sh -c \ + "if ! curl -f -s -LI 'http://localhost:8080/api/v1/variables/path_tmp' --user \"airflow:airflow\" > /dev/null; \ + then \ + curl -s -X 'POST' \ + 'http://localhost:8080/api/v1/variables' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + --user \"airflow:airflow\" \ + -d '{ \ + \"key\": \"path_tmp\", \ + \"value\": \"/tmp\" \ + }' > /dev/null; \ + fi" + +create-inlabs-db: + @docker exec -e PGPASSWORD=airflow ro-dou-postgres-1 sh -c "psql -q -U airflow -f /sql/init-db.sql > /dev/null" + +create-inlabs-db-connection: + @echo "Creating 'inlabs_db' Airflow connection" + @docker exec airflow-webserver sh -c \ + "if ! curl -f -s -LI 'http://localhost:8080/api/v1/connections/inlabs_db' --user \"airflow:airflow\" > /dev/null; \ + then \ + curl -s -X 'POST' \ + 'http://localhost:8080/api/v1/connections' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + --user \"airflow:airflow\" \ + -d '{ \ + \"connection_id\": \"inlabs_db\", \ + \"conn_type\": \"postgres\", \ + \"schema\": \"inlabs\", \ + \"host\": \"ro-dou-postgres-1\", \ + \"login\": \"airflow\", \ + \"password\": \"airflow\", \ + \"port\": 5432 \ + }' > /dev/null; \ + fi" + +create-inlabs-portal-connection: + @echo "Creating 'inlabs_portal' Airflow connection" + @docker exec airflow-webserver sh -c \ + "if ! curl -f -s -LI 'http://localhost:8080/api/v1/connections/inlabs_portal' --user \"airflow:airflow\" > /dev/null; \ + then \ + curl -s -X 'POST' \ + 'http://localhost:8080/api/v1/connections' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + --user \"airflow:airflow\" \ + -d '{ \ + \"connection_id\": \"inlabs_portal\", \ + \"conn_type\": \"http\", \ + \"description\": \"Credencial para acesso no Portal do INLabs\", \ + \"host\": \"https://inlabs.in.gov.br/\", \ + \"login\": \"user@email.com\", \ + \"password\": \"password\" \ + }' > /dev/null; \ + fi" + +activate-inlabs-load-dag: + @echo "Activating 'dou_inlabs_load_pg' Airflow DAG" + @docker exec airflow-webserver sh -c \ + "curl -s -X 'PATCH' \ + 'http://localhost:8080/api/v1/dags/ro-dou_inlabs_load_pg' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + --user \"airflow:airflow\" \ + -d '{ \ + \"is_paused\": false \ + }' > /dev/null;" + .PHONY: down down: docker compose down From b4e6942d90ceebb443228f1b9493f19dc70cba8b Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Thu, 31 Oct 2024 18:03:31 -0300 Subject: [PATCH 3/9] update dataset name and remove comments --- dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py | 28 ++++---------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py b/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py index 8314408..81ed6f8 100644 --- a/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py +++ b/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py @@ -20,11 +20,8 @@ # Constants DEST_DIR = "download_inlabs" -#XXX update here DEST_CONN_ID = "inlabs_db" -#XXX connection to https://inlabs.in.gov.br/ INLABS_CONN_ID = "inlabs_portal" -#XXX remember to create schema `dou_inlabs` on db STG_TABLE = "dou_inlabs.article_raw" @@ -143,6 +140,7 @@ def _unzip_files(): files_exists = _download_files() _unzip_files() + return files_exists @task @@ -225,7 +223,7 @@ def check_if_first_run_of_day(): if execution_date.day == prev_execution_date.day: logging.info ("Não é a primeira execução do dia") logging.info ("Triggering dataset edicao_extra") - return "trigger_dataset_edicao_extra" + return "trigger_dataset_inlabs_edicao_extra" else: logging.info ("Primeira execução do dia") logging.info ("Triggering dataset e DAGs do INLABS") @@ -247,29 +245,13 @@ def remove_directory(): subprocess.run(f"rm -rf {dest_path}", shell=True, check=True) logging.info("Directory %s removed.", dest_path) - # @task_group(group_id='datasets') - # def trigger_datasets(): - # @task.run_if(lambda context: context["task_instance"].execution_date.hour == 15) - # @task(outlets=[Dataset("inlabs")]) - # def trigger_dataset_edicao_normal(): - # logging.info("Disparando DAGs do INLABS") - - # @task.run_if(lambda context: context["task_instance"].execution_date.hour > 15) - # @task(outlets=[Dataset("inlabs_edicao_extra")]) - # def trigger_dataset_edicao_extra(**kwargs): - # logging.info(context["task_instance"]) - # logging.info("Atualizando o Dataset de Edição Extra") - - # trigger_dataset_edicao_normal(), trigger_dataset_edicao_extra() - - - ## Orchestration + ## Orchestration trigger_date = get_date() download_n_unzip_files(trigger_date) >> \ load_data(trigger_date) >> check_loaded_data >> \ + remove_directory() >> \ check_if_first_run_of_day() >> \ - [trigger_dataset_inlabs_edicao_extra(),trigger_dataset_inlabs()] >> \ - remove_directory() + [trigger_dataset_inlabs_edicao_extra(),trigger_dataset_inlabs()] load_inlabs() From 80898bf05b03086632d5d19a21cd667e2d9f6e33 Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Thu, 31 Oct 2024 18:03:55 -0300 Subject: [PATCH 4/9] update install docs --- docs/docs/como_utilizar/instalacao.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/docs/como_utilizar/instalacao.md b/docs/docs/como_utilizar/instalacao.md index 436bbb2..e1cd2ac 100644 --- a/docs/docs/como_utilizar/instalacao.md +++ b/docs/docs/como_utilizar/instalacao.md @@ -33,7 +33,8 @@ Agora, faremos um segundo teste: o clipping **terms_from_variable**, seguindo os Leia a seção **Configurando em Produção** para instalar o Ro-dou utilizando um provedor SMTP real que enviará os e-mails para os destinatários verdadeiros. -**Observação:** Para utilizar o `source: - INLABS`, é necessário criar a conexão `inlabs_db` no Apache Airflow, apontando para o banco `Postgres` que está carregado com os dados do inlabs. Você poderá encontrar aqui um exemplo de como carregar um banco com os dados do inlabs: [`ro-dou_inlabs_load_pg_dag.py`](https://github.com/gestaogovbr/Ro-dou/blob/main/dag_load_inlabs/ro-dou_inlabs_load_pg_dag.py). +**Observação:** Para utilizar o `source: - INLABS`, é necessário alterar a conexão `inlabs_portal` no Apache Airflow, apontando o usuário e senha de autenticação do portal. Um novo usuário pode ser cadastrado pelo portal [INLABS](https://inlabs.in.gov.br/acessar.php). A DAG +que realiza o download dos arquivos do INLABS é a **ro-dou_inlabs_load_pg**. Quando tiver terminado de utilizar o ambiente de teste do Ro-DOU, desligue-o por meio do seguinte comando: From ec34ba6f36530737cd2037871546d8edb9152b1f Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Thu, 31 Oct 2024 18:05:33 -0300 Subject: [PATCH 5/9] add volumes for logs, inlabs_dag and init sql --- docker-compose.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 5eb41ef..f25fbf1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -73,7 +73,9 @@ volumes: - ./src:/opt/airflow/dags/ro_dou_src # for development purpose + - ./mnt/logs:/opt/airflow/logs - ./dag_confs:/opt/airflow/dags/ro_dou/dag_confs + - ./dag_load_inlabs:/opt/airflow/dags/dag_load_inlabs - ./tests:/opt/airflow/tests # for test purpose - ./schemas:/opt/airflow/schemas # for test purpose depends_on: @@ -89,6 +91,7 @@ POSTGRES_DB: airflow volumes: - ./mnt/pgdata:/var/lib/postgresql/data + - ./dag_load_inlabs/sql/:/sql healthcheck: test: ["CMD", "pg_isready", "-U", "airflow"] interval: 5s From e834cb3ec590bca582e0ac702238fafff18d322b Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Thu, 31 Oct 2024 18:06:04 -0300 Subject: [PATCH 6/9] create init sql scripts for inlabs database --- dag_load_inlabs/sql/init-db.sql | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 dag_load_inlabs/sql/init-db.sql diff --git a/dag_load_inlabs/sql/init-db.sql b/dag_load_inlabs/sql/init-db.sql new file mode 100644 index 0000000..4dcc807 --- /dev/null +++ b/dag_load_inlabs/sql/init-db.sql @@ -0,0 +1,5 @@ +CREATE DATABASE inlabs; + +\c inlabs + +CREATE SCHEMA IF NOT EXISTS dou_inlabs; \ No newline at end of file From 17ae2c2171aff2ef908fab7bd5635127c9d6103f Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Fri, 1 Nov 2024 12:16:19 -0300 Subject: [PATCH 7/9] add step to create logs folder --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index 410be5f..2cd746c 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ .PHONY: run run: \ +create-logs-dir \ setup-containers \ create-example-variable \ create-path-tmp-variable \ @@ -8,6 +9,9 @@ create-inlabs-db-connection \ create-inlabs-portal-connection \ activate-inlabs-load-dag +create-logs-dir: + mkdir -p ./airflow-logs -m a=rwx + setup-containers: docker compose up -d --force-recreate --remove-orphans @@ -46,6 +50,7 @@ create-path-tmp-variable: fi" create-inlabs-db: + @echo "Creating 'inlabs' database" @docker exec -e PGPASSWORD=airflow ro-dou-postgres-1 sh -c "psql -q -U airflow -f /sql/init-db.sql > /dev/null" create-inlabs-db-connection: From 8e6b782b0501db8566f3e1ec045b9f7f0e3f0f52 Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Fri, 1 Nov 2024 12:20:03 -0300 Subject: [PATCH 8/9] change folder name --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2cd746c..2812d9c 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ create-inlabs-portal-connection \ activate-inlabs-load-dag create-logs-dir: - mkdir -p ./airflow-logs -m a=rwx + mkdir -p ./mnt/airflow-logs -m a=rwx setup-containers: docker compose up -d --force-recreate --remove-orphans From cae6fdacfebc7c42f4467cd9e0183487e822506a Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Fri, 1 Nov 2024 12:20:17 -0300 Subject: [PATCH 9/9] change folder name --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index f25fbf1..94e7424 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -73,7 +73,7 @@ volumes: - ./src:/opt/airflow/dags/ro_dou_src # for development purpose - - ./mnt/logs:/opt/airflow/logs + - ./mnt/airflow-logs:/opt/airflow/logs - ./dag_confs:/opt/airflow/dags/ro_dou/dag_confs - ./dag_load_inlabs:/opt/airflow/dags/dag_load_inlabs - ./tests:/opt/airflow/tests # for test purpose