Skip to content

Commit 5fd4e49

Browse files
authored
Merge pull request #71 from trocco-io/feature/match_by_column_name
support match_by_column_name
2 parents 9cfaa02 + 19216a7 commit 5fd4e49

File tree

4 files changed

+147
-1
lines changed

4 files changed

+147
-1
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Snowflake output plugin for Embulk loads records to Snowflake.
2727
- **merge_keys**: key column names for merging records in merge mode (string array, required in merge mode if table doesn't have primary key)
2828
- **merge_rule**: list of column assignments for updating existing records used in merge mode, for example `"foo" = T."foo" + S."foo"` (`T` means target table and `S` means source table). (string array, default: always overwrites with new values)
2929
- **batch_size**: size of a single batch insert (integer, default: 16777216)
30+
- **match_by_column_name**: specify whether to load semi-structured data into columns in the target table that match corresponding columns represented in the data. ("case_sensitive", "case_insensitive", "none", default: "none")
3031
- **default_timezone**: If the input column type (embulk type) is timestamp, this plugin needs to format the timestamp into a SQL string. This default_timezone option is used to control the timezone. You can override the timezone for each column using the column_options option. (string, default: `UTC`)
3132
- **column_options**: advanced: key-value pairs where the key is a column name and the value is the options for that column.
3233
- **type**: type of a column when this plugin creates new tables (e.g. `VARCHAR(255)`, `INTEGER NOT NULL UNIQUE`). This is used when this plugin creates intermediate tables (insert, truncate_insert and merge modes), when it creates the target table (insert_direct and replace modes), and when it creates a nonexistent target table automatically. (string, default: depends on input column type. `BIGINT` if input column type is long, `BOOLEAN` if boolean, `DOUBLE PRECISION` if double, `CLOB` if string, `TIMESTAMP` if timestamp)

src/main/java/org/embulk/output/SnowflakeOutputPlugin.java

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
package org.embulk.output;
22

3+
import com.fasterxml.jackson.annotation.JsonCreator;
4+
import com.fasterxml.jackson.annotation.JsonValue;
35
import java.io.IOException;
46
import java.sql.SQLException;
57
import java.sql.Types;
68
import java.util.*;
9+
import java.util.function.BiFunction;
710
import net.snowflake.client.jdbc.internal.org.bouncycastle.operator.OperatorCreationException;
811
import net.snowflake.client.jdbc.internal.org.bouncycastle.pkcs.PKCSException;
912
import org.embulk.config.ConfigDiff;
@@ -78,6 +81,47 @@ public interface SnowflakePluginTask extends PluginTask {
7881
@Config("delete_stage_on_error")
7982
@ConfigDefault("false")
8083
public boolean getDeleteStageOnError();
84+
85+
@Config("match_by_column_name")
86+
@ConfigDefault("\"none\"")
87+
public MatchByColumnName getMatchByColumnName();
88+
89+
public void setCopyIntoTableColumnNames(String[] columnNames);
90+
91+
public String[] getCopyIntoTableColumnNames();
92+
93+
public void setCopyIntoCSVColumnNumbers(int[] columnNumbers);
94+
95+
public int[] getCopyIntoCSVColumnNumbers();
96+
97+
public enum MatchByColumnName {
98+
CASE_SENSITIVE,
99+
CASE_INSENSITIVE,
100+
NONE;
101+
102+
@JsonValue
103+
@Override
104+
public String toString() {
105+
return name().toLowerCase(Locale.ENGLISH);
106+
}
107+
108+
@JsonCreator
109+
public static MatchByColumnName fromString(String value) {
110+
switch (value) {
111+
case "case_sensitive":
112+
return CASE_SENSITIVE;
113+
case "case_insensitive":
114+
return CASE_INSENSITIVE;
115+
case "none":
116+
return NONE;
117+
default:
118+
throw new ConfigException(
119+
String.format(
120+
"Unknown match_by_column_name '%s'. Supported values are case_sensitive, case_insensitive, none",
121+
value));
122+
}
123+
}
124+
}
81125
}
82126

83127
@Override
@@ -187,6 +231,38 @@ protected void doBegin(
187231
JdbcOutputConnection con, PluginTask task, final Schema schema, int taskCount)
188232
throws SQLException {
189233
super.doBegin(con, task, schema, taskCount);
234+
235+
SnowflakePluginTask pluginTask = (SnowflakePluginTask) task;
236+
SnowflakePluginTask.MatchByColumnName matchByColumnName = pluginTask.getMatchByColumnName();
237+
if (matchByColumnName == SnowflakePluginTask.MatchByColumnName.NONE) {
238+
pluginTask.setCopyIntoCSVColumnNumbers(new int[0]);
239+
pluginTask.setCopyIntoTableColumnNames(new String[0]);
240+
return;
241+
}
242+
243+
List<String> copyIntoTableColumnNames = new ArrayList<>();
244+
List<Integer> copyIntoCSVColumnNumbers = new ArrayList<>();
245+
JdbcSchema targetTableSchema = pluginTask.getTargetTableSchema();
246+
BiFunction<String, String, Boolean> compare =
247+
matchByColumnName == SnowflakePluginTask.MatchByColumnName.CASE_SENSITIVE
248+
? String::equals
249+
: String::equalsIgnoreCase;
250+
int columnNumber = 1;
251+
for (int i = 0; i < targetTableSchema.getCount(); i++) {
252+
JdbcColumn targetColumn = targetTableSchema.getColumn(i);
253+
if (targetColumn.isSkipColumn()) {
254+
continue;
255+
}
256+
Column schemaColumn = schema.getColumn(i);
257+
if (compare.apply(schemaColumn.getName(), targetColumn.getName())) {
258+
copyIntoTableColumnNames.add(targetColumn.getName());
259+
copyIntoCSVColumnNumbers.add(columnNumber);
260+
}
261+
columnNumber += 1;
262+
}
263+
pluginTask.setCopyIntoTableColumnNames(copyIntoTableColumnNames.toArray(new String[0]));
264+
pluginTask.setCopyIntoCSVColumnNumbers(
265+
copyIntoCSVColumnNumbers.stream().mapToInt(i -> i).toArray());
190266
}
191267

192268
@Override
@@ -201,6 +277,8 @@ protected BatchInsert newBatchInsert(PluginTask task, Optional<MergeConfig> merg
201277
return new SnowflakeCopyBatchInsert(
202278
getConnector(task, true),
203279
StageIdentifierHolder.getStageIdentifier(pluginTask),
280+
pluginTask.getCopyIntoTableColumnNames(),
281+
pluginTask.getCopyIntoCSVColumnNumbers(),
204282
false,
205283
pluginTask.getMaxUploadRetries(),
206284
pluginTask.getEmtpyFieldAsNull());

src/main/java/org/embulk/output/snowflake/SnowflakeCopyBatchInsert.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,15 @@ public class SnowflakeCopyBatchInsert implements BatchInsert {
4040
private List<Future<Void>> uploadAndCopyFutures;
4141
private boolean emptyFieldAsNull;
4242

43+
private String[] copyIntoTableColumnNames;
44+
45+
private int[] copyIntoCSVColumnNumbers;
46+
4347
public SnowflakeCopyBatchInsert(
4448
JdbcOutputConnector connector,
4549
StageIdentifier stageIdentifier,
50+
String[] copyIntoTableColumnNames,
51+
int[] copyIntoCSVColumnNumbers,
4652
boolean deleteStageFile,
4753
int maxUploadRetries,
4854
boolean emptyFieldAsNull)
@@ -51,6 +57,8 @@ public SnowflakeCopyBatchInsert(
5157
openNewFile();
5258
this.connector = connector;
5359
this.stageIdentifier = stageIdentifier;
60+
this.copyIntoTableColumnNames = copyIntoTableColumnNames;
61+
this.copyIntoCSVColumnNumbers = copyIntoCSVColumnNumbers;
5462
this.executorService = Executors.newCachedThreadPool();
5563
this.deleteStageFile = deleteStageFile;
5664
this.uploadAndCopyFutures = new ArrayList();
@@ -417,6 +425,8 @@ public Void call() throws SQLException, InterruptedException, ExecutionException
417425
tableIdentifier,
418426
stageIdentifier,
419427
snowflakeStageFileName,
428+
copyIntoTableColumnNames,
429+
copyIntoCSVColumnNumbers,
420430
delimiterString,
421431
emptyFieldAsNull);
422432

src/main/java/org/embulk/output/snowflake/SnowflakeOutputConnection.java

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,24 @@ public void runCopy(
2525
TableIdentifier tableIdentifier,
2626
StageIdentifier stageIdentifier,
2727
String filename,
28+
String[] tableColumnNames,
29+
int[] csvColumnNumbers,
2830
String delimiterString,
2931
boolean emptyFieldAsNull)
3032
throws SQLException {
3133
String sql =
32-
buildCopySQL(tableIdentifier, stageIdentifier, filename, delimiterString, emptyFieldAsNull);
34+
tableColumnNames != null && tableColumnNames.length > 0
35+
? buildCopySQL(
36+
tableIdentifier,
37+
stageIdentifier,
38+
filename,
39+
tableColumnNames,
40+
csvColumnNumbers,
41+
delimiterString,
42+
emptyFieldAsNull)
43+
: buildCopySQL(
44+
tableIdentifier, stageIdentifier, filename, delimiterString, emptyFieldAsNull);
45+
3346
runUpdate(sql);
3447
}
3548

@@ -196,6 +209,50 @@ protected String buildCopySQL(
196209
return sb.toString();
197210
}
198211

212+
protected String buildCopySQL(
213+
TableIdentifier tableIdentifier,
214+
StageIdentifier stageIdentifier,
215+
String snowflakeStageFileName,
216+
String[] tableColumnNames,
217+
int[] csvColumnNumbers,
218+
String delimiterString,
219+
boolean emptyFieldAsNull) {
220+
// Data load with transformation
221+
// Correspondence between CSV column numbers and table column names can be specified.
222+
// https://docs.snowflake.com/ja/sql-reference/sql/copy-into-table
223+
224+
StringBuilder sb = new StringBuilder();
225+
sb.append("COPY INTO ");
226+
quoteTableIdentifier(sb, tableIdentifier);
227+
sb.append(" (");
228+
for (int i = 0; i < tableColumnNames.length; i++) {
229+
if (i != 0) {
230+
sb.append(", ");
231+
}
232+
String column = quoteIdentifierString(tableColumnNames[i]);
233+
sb.append(column);
234+
}
235+
sb.append(" ) FROM ( SELECT ");
236+
for (int i = 0; i < csvColumnNumbers.length; i++) {
237+
if (i != 0) {
238+
sb.append(", ");
239+
}
240+
sb.append("t.$");
241+
sb.append(csvColumnNumbers[i]);
242+
}
243+
sb.append(" from ");
244+
quoteInternalStoragePath(sb, stageIdentifier, snowflakeStageFileName);
245+
sb.append(" t ) ");
246+
sb.append(" FILE_FORMAT = ( TYPE = CSV FIELD_DELIMITER = '");
247+
sb.append(delimiterString);
248+
sb.append("'");
249+
if (!emptyFieldAsNull) {
250+
sb.append(" EMPTY_FIELD_AS_NULL = FALSE");
251+
}
252+
sb.append(" );");
253+
return sb.toString();
254+
}
255+
199256
protected String buildDeleteStageFileSQL(
200257
StageIdentifier stageIdentifier, String snowflakeStageFileName) {
201258
StringBuilder sb = new StringBuilder();

0 commit comments

Comments
 (0)