Skip to content

Commit 05b847a

Browse files
authored
Merge pull request #1 from trocco-io/develop
init
2 parents a2c8ade + d9486f6 commit 05b847a

29 files changed

+2382
-1
lines changed

.github/workflows/main.yml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
name: main
2+
3+
on:
4+
push:
5+
branches:
6+
- 'main'
7+
tags:
8+
- '*'
9+
pull_request:
10+
branches:
11+
- 'main'
12+
types: [opened, synchronize]
13+
pull_request_target:
14+
branches:
15+
- 'main'
16+
types: [labeled]
17+
18+
jobs:
19+
test:
20+
name: test
21+
runs-on: ubuntu-latest
22+
if: >
23+
${{
24+
github.event_name == 'pull_request' ||
25+
(github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'safe to test')) ||
26+
startsWith(github.ref, 'refs/tags/')
27+
}}
28+
steps:
29+
- uses: actions/checkout@v2
30+
- name: Set up JDK 1.8
31+
uses: actions/setup-java@v1
32+
with:
33+
java-version: 1.8
34+
- name: lint
35+
run: ./gradlew spotlessCheck
36+
- name: Test
37+
run: ./gradlew test
38+
build:
39+
name: Build + Publish
40+
runs-on: ubuntu-latest
41+
permissions:
42+
packages: write
43+
contents: read
44+
needs: [ test ]
45+
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.ref, 'refs/tags/') }}
46+
steps:
47+
- uses: actions/checkout@v2
48+
- name: Set up Ruby 2.7
49+
uses: ruby/setup-ruby@v1
50+
with:
51+
ruby-version: 2.7
52+
- name: push gem
53+
uses: trocco-io/push-gem-to-gpr-action@v1
54+
with:
55+
language: java
56+
gem-path: "./build/gems/*.gem"
57+
github-token: "${{ secrets.GITHUB_TOKEN }}"

.gitignore

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
*~
2+
/pkg/
3+
/tmp/
4+
*.gemspec
5+
.gradle/
6+
/classpath/
7+
build/
8+
.idea
9+
/.settings/
10+
/.metadata/
11+
.classpath
12+
.project
13+
config.yml
14+
default_jdbc_driver
15+
/bin/
16+
example/test.yml

LICENSE.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
2+
MIT License
3+
4+
Permission is hereby granted, free of charge, to any person obtaining
5+
a copy of this software and associated documentation files (the
6+
"Software"), to deal in the Software without restriction, including
7+
without limitation the rights to use, copy, modify, merge, publish,
8+
distribute, sublicense, and/or sell copies of the Software, and to
9+
permit persons to whom the Software is furnished to do so, subject to
10+
the following conditions:
11+
12+
The above copyright notice and this permission notice shall be
13+
included in all copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.md

Lines changed: 186 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,186 @@
1-
# embulk-input-databricks
1+
# Databricks input plugin for Embulk
2+
3+
Databricks input plugin for Embulk loads records from Databricks.
4+
5+
## Overview
6+
7+
* **Plugin type**: input
8+
* **Resume supported**: yes
9+
10+
## Configuration
11+
12+
- **driver_path**: path to the jar file of the JDBC driver. If not set, [the bundled JDBC driver](https://docs.databricks.com/en/integrations/jdbc/index.html) will be used. (string, optional)
13+
- **server_hostname**: The Databricks compute resource’s Server Hostname value, see [Compute settings for the Databricks JDBC Driver](https://docs.databricks.com/en/integrations/jdbc/compute.html). (string, required)
14+
- **http_path**: The Databricks compute resource’s HTTP Path value, see [Compute settings for the Databricks JDBC Driver](https://docs.databricks.com/en/integrations/jdbc/compute.html). (string, required)
15+
- **personal_access_token**: The Databaricks personal_access_token, see [Authentication settings for the Databricks JDBC Driver](https://docs.databricks.com/en/integrations/jdbc/authentication.html#authentication-pat). (string, required)
16+
- **catalog_name**: destination catalog name (string, optional)
17+
- **schema_name**: destination schema name (string, optional)
18+
- **where**: WHERE condition to filter the rows (string, default: no-condition)
19+
- **fetch_rows**: number of rows to fetch one time (used for java.sql.Statement#setFetchSize) (integer, default: 10000)
20+
- **connect_timeout**: timeout for establishment of a database connection. (integer (seconds), default: 300)
21+
- **socket_timeout**: timeout for socket read operations. 0 means no timeout. (integer (seconds), default: 1800)
22+
- If you write SQL directly,
23+
- **query**: SQL to run (string)
24+
- If **query** is not set,
25+
- **table**: destination table name (string, required)
26+
- **select**: expression of select (e.g. `id, created_at`) (string, default: "*")
27+
- **where**: WHERE condition to filter the rows (string, default: no-condition)
28+
- **order_by**: expression of ORDER BY to sort rows (e.g. `created_at DESC, id ASC`) (string, default: not sorted)
29+
- **incremental**: if true, enables incremental loading. See next section for details (boolean, default: false)
30+
- **incremental_columns**: column names for incremental loading (array of strings, default: use primary keys). Columns of integer types, string types, `timestamp` are supported.
31+
- **last_record**: values of the last record for incremental loading (array of objects, default: load all records)
32+
- **default_timezone**: If the sql type of a column is `date`/`time`/`datetime` and the embulk type is `string`, column values are formatted int this default_timezone. You can overwrite timezone for each columns using column_options option. (string, default: `UTC`)
33+
- **default_column_options**: advanced: column_options for each JDBC type as default. key-value pairs where key is a JDBC type (e.g. 'DATE', 'BIGINT') and value is same as column_options's value.
34+
- **column_options**: advanced: key-value pairs where key is a column name and value is options for the column.
35+
- **value_type**: embulk get values from database as this value_type. Typically, the value_type determines `getXXX` method of `java.sql.PreparedStatement`.
36+
(string, default: depends on the sql type of the column. Available values options are: `long`, `double`, `float`, `decimal`, `boolean`, `string`, `json`, `date`, `time`, `timestamp`)
37+
- **type**: Column values are converted to this embulk type.
38+
Available values options are: `boolean`, `long`, `double`, `string`, `json`, `timestamp`).
39+
By default, the embulk type is determined according to the sql type of the column (or value_type if specified).
40+
- **timestamp_format**: If the sql type of the column is `date`/`time`/`datetime` and the embulk type is `string`, column values are formatted by this timestamp_format. And if the embulk type is `timestamp`, this timestamp_format may be used in the output plugin. For example, stdout plugin use the timestamp_format, but *csv formatter plugin doesn't use*. (string, default : `%Y-%m-%d` for `date`, `%H:%M:%S` for `time`, `%Y-%m-%d %H:%M:%S` for `timestamp`)
41+
- **timezone**: If the sql type of the column is `date`/`time`/`datetime` and the embulk type is `string`, column values are formatted in this timezone.
42+
(string, value of default_timezone option is used by default)
43+
- **before_setup**: if set, this SQL will be executed before setup. You can prepare table for input by this option.
44+
- **before_select**: if set, this SQL will be executed before the SELECT query. (Other plugins execute query in the same transaction, but Databricks does not support transaction in multi statement, so this plugin does not support it.)
45+
- **after_select**: if set, this SQL will be executed after the SELECT query. (Other plugins execute query in the same transaction, but Databricks does not support transaction in multi statement, so this plugin does not support it.)
46+
47+
48+
### Incremental loading
49+
50+
Incremental loading uses monotonically increasing unique columns (such as auto-increment (IDENTITY) column) to load records inserted (or updated) after last execution.
51+
52+
First, if `incremental: true` is set, this plugin loads all records with additional ORDER BY. For example, if `incremental_columns: [updated_at, id]` option is set, query will be as following:
53+
54+
```
55+
SELECT * FROM (
56+
...original query is here...
57+
)
58+
ORDER BY updated_at, id
59+
```
60+
61+
When bulk data loading finishes successfully, it outputs `last_record: ` paramater as config-diff so that next execution uses it.
62+
63+
At the next execution, when `last_record: ` is also set, this plugin generates additional WHERE conditions to load records larger than the last record. For example, if `last_record: ["2017-01-01T00:32:12.487659", 5291]` is set,
64+
65+
```
66+
SELECT * FROM (
67+
...original query is here...
68+
)
69+
WHERE updated_at > '2017-01-01 00:32:12.487659' OR (updated_at = '2017-01-01 00:32:12.487659' AND id > 5291)
70+
ORDER BY updated_at, id
71+
```
72+
73+
Then, it updates `last_record: ` so that next execution uses the updated last_record.
74+
75+
**IMPORTANT**: If you set `incremental_columns: ` option, make sure that there is an index on the columns to avoid full table scan. For this example, following index should be created:
76+
77+
```
78+
CREATE INDEX embulk_incremental_loading_index ON table (updated_at, id);
79+
```
80+
81+
Recommended usage is to leave `incremental_columns` unset and let this plugin automatically finds an auto-increment (IDENTITY) primary key. Currently, only strings, integers, TIMESTAMP and TIMESTAMPTZ are supported as incremental_columns.
82+
83+
## Example
84+
85+
```yaml
86+
in:
87+
type: databricks
88+
server_hostname: dbc-xxxx.cloud.databricks.com
89+
http_path: /sql/1.0/warehouses/xxxxx
90+
personal_access_token: dapixxxxxx
91+
catalog_name: test_catalog
92+
schema_name: test_schema
93+
table: test_date
94+
select: "col1, col2, col3"
95+
where: "col4 != 'a'"
96+
order_by: "col1 DESC"
97+
```
98+
99+
This configuration will generate following SQL:
100+
101+
```
102+
SELECT col1, col2, col3
103+
FROM "my_table"
104+
WHERE col4 != 'a'
105+
ORDER BY col1 DESC
106+
```
107+
108+
If you need a complex SQL,
109+
110+
```yaml
111+
in:
112+
type: databricks
113+
server_hostname: dbc-xxxx.cloud.databricks.com
114+
http_path: /sql/1.0/warehouses/xxxxx
115+
personal_access_token: dapixxxxxx
116+
catalog_name: test_catalog
117+
query: |
118+
SELECT t1.id, t1.name, t2.id AS t2_id, t2.name AS t2_name
119+
FROM table1 AS t1
120+
LEFT JOIN table2 AS t2
121+
ON t1.id = t2.t1_id
122+
```
123+
124+
Advanced configuration:
125+
126+
```yaml
127+
in:
128+
type: databricks
129+
server_hostname: dbc-xxxx.cloud.databricks.com
130+
http_path: /sql/1.0/warehouses/xxxxx
131+
personal_access_token: dapixxxxxx
132+
catalog_name: test_catalog
133+
schema_name: test_schema
134+
table: test_date
135+
select: "col1, col2, col3"
136+
where: "col4 != 'a'"
137+
default_column_options:
138+
TIMESTAMP: { type: string, timestamp_format: "%Y/%m/%d %H:%M:%S", timezone: "+0900"}
139+
BIGINT: { type: string }
140+
column_options:
141+
col1: {type: long}
142+
col3: {type: string, timestamp_format: "%Y/%m/%d", timezone: "+0900"}
143+
after_select: "update my_table set col5 = '1' where col4 != 'a'"
144+
145+
```
146+
147+
## NOTE
148+
149+
### Correspondence table for databrick types and JDBC Types
150+
151+
| databrick types | JDBC Types |
152+
|--------------- |----------- |
153+
| BIGINT | BIGINT |
154+
| BINARY | unsupported |
155+
| BOOLEAN | BOOLEAN |
156+
| DATE | DATE |
157+
| DECIMAL | DECIMAL |
158+
| DOUBLE | DOUBLE |
159+
| FLOAT | REAL |
160+
| INT | INTEGER |
161+
| INTERVAL | VARCHAR |
162+
| SMALLINT | SMALLINT |
163+
| STRING | VARCHAR |
164+
| TIMETAMP | TIMESTAMP |
165+
| TIMETAMP\_NTZ | unsupported |
166+
| TINYINT | TINYINT |
167+
| ARRAY | VARCHAR |
168+
| MAP | VARCHAR |
169+
| STRUCT | VARCHAR |
170+
171+
### TIMESTAMP_NTZ
172+
173+
[The official Databricks JDBC driver does not support TIMESTAMP_NTZ](https://docs.databricks.com/en/sql/language-manual/data-types/timestamp-ntz-type.html#notes), so this plugin officially does not support TIMESTAMP_NTZ.
174+
175+
176+
## Build
177+
178+
```
179+
$ ./gradlew gem
180+
```
181+
182+
Running tests:
183+
184+
```
185+
$ EMBULK_INPUT_DATABRICKS_TEST_CONFIG="example/test.yml" ./gradlew test # Create example/test.yml based on example/test.yml.example
186+
```

build.gradle

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
plugins {
2+
id "java"
3+
id "maven-publish"
4+
id "org.embulk.embulk-plugins" version "0.5.5"
5+
id "com.palantir.git-version" version "0.13.0"
6+
id "com.diffplug.spotless" version "5.15.0"
7+
id "com.adarshr.test-logger" version "3.0.0"
8+
}
9+
10+
repositories {
11+
mavenCentral()
12+
}
13+
14+
group = "io.trocco"
15+
description = "Loads records from Databricks."
16+
17+
sourceCompatibility = 1.8
18+
targetCompatibility = 1.8
19+
20+
version = {
21+
def vd = versionDetails()
22+
if (vd.commitDistance == 0 && vd.lastTag ==~ /^v[0-9]+\.[0-9]+\.[0-9]+(\.[a-zA-Z0-9]+)?/) {
23+
vd.lastTag.substring(1)
24+
} else {
25+
"0.0.0.${vd.gitHash}"
26+
}
27+
}()
28+
29+
dependencies {
30+
def embulkVersion = "0.10.36"
31+
32+
compileOnly("org.embulk:embulk-api:${embulkVersion}")
33+
compileOnly("org.embulk:embulk-spi:${embulkVersion}")
34+
compile("org.embulk:embulk-input-jdbc:0.13.2")
35+
compile('com.databricks:databricks-jdbc:2.6.34')
36+
37+
testImplementation "junit:junit:4.+"
38+
testImplementation "org.embulk:embulk-junit4:${embulkVersion}"
39+
testImplementation "org.embulk:embulk-core:${embulkVersion}"
40+
testImplementation "org.embulk:embulk-deps:${embulkVersion}"
41+
testImplementation "org.embulk:embulk-formatter-csv:${embulkVersion}"
42+
testImplementation "org.embulk:embulk-output-file:${embulkVersion}"
43+
44+
//SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
45+
//SLF4J: Defaulting to no-operation (NOP) logger implementation
46+
//SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
47+
testImplementation("org.slf4j:slf4j-simple:1.7.30")
48+
}
49+
50+
embulkPlugin {
51+
mainClass = "org.embulk.input.DatabricksInputPlugin"
52+
category = "input"
53+
type = "databricks"
54+
}
55+
56+
test {
57+
environment "TZ", "UTC"
58+
}
59+
60+
clean {
61+
delete "classpath"
62+
delete 'default_jdbc_driver'
63+
}
64+
65+
gem {
66+
from("LICENSE.txt")
67+
authors = [ "" ]
68+
email = [ "" ]
69+
summary = "Databricks input plugin for Embulk"
70+
homepage = "https://github.com/trocco-io/embulk-input-databricks"
71+
licenses = [ "MIT" ]
72+
}
73+
spotless {
74+
java {
75+
importOrder()
76+
removeUnusedImports()
77+
googleJavaFormat()
78+
toggleOffOn()
79+
}
80+
}
81+

example/test.yml.example

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# The catalog_name.schema_name and another_catalog_name.schema_name must be created in advance.
2+
# The another_catalog_name must be different from catalog_name.
3+
4+
server_hostname:
5+
http_path:
6+
personal_access_token:
7+
catalog_name:
8+
schema_name:
9+
table_prefix:
10+
another_catalog_name:
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# This is a Gradle generated file for dependency locking.
2+
# Manual edits can break the build and are not advised.
3+
# This file is expected to be part of source control.
4+
com.databricks:databricks-jdbc:2.6.34
5+
com.fasterxml.jackson.core:jackson-annotations:2.6.7
6+
com.fasterxml.jackson.core:jackson-core:2.6.7
7+
com.fasterxml.jackson.core:jackson-databind:2.6.7
8+
com.fasterxml.jackson.datatype:jackson-datatype-jdk8:2.6.7
9+
javax.validation:validation-api:1.1.0.Final
10+
org.embulk:embulk-api:0.10.36
11+
org.embulk:embulk-input-jdbc:0.13.2
12+
org.embulk:embulk-spi:0.10.36
13+
org.embulk:embulk-util-config:0.3.2
14+
org.embulk:embulk-util-json:0.1.1
15+
org.embulk:embulk-util-timestamp:0.2.1
16+
org.msgpack:msgpack-core:0.8.11
17+
org.slf4j:slf4j-api:1.7.30

0 commit comments

Comments
 (0)