diff --git "a/Governance/\317\200Flow_Open_Source_Individual_CLA.docx" "b/Governance/\317\200Flow_Open_Source_Individual_CLA.docx"
new file mode 100644
index 00000000..6d4a00eb
Binary files /dev/null and "b/Governance/\317\200Flow_Open_Source_Individual_CLA.docx" differ
diff --git "a/Governance/\317\200Flow_Open_Source_Individual_CLA.pdf" "b/Governance/\317\200Flow_Open_Source_Individual_CLA.pdf"
new file mode 100644
index 00000000..72fc2655
Binary files /dev/null and "b/Governance/\317\200Flow_Open_Source_Individual_CLA.pdf" differ
diff --git "a/Governance/\345\216\237\345\210\231.md" "b/Governance/\345\216\237\345\210\231.md"
index 9173e661..0b75cbb9 100644
--- "a/Governance/\345\216\237\345\210\231.md"
+++ "b/Governance/\345\216\237\345\210\231.md"
@@ -29,4 +29,4 @@ The PiFlow community follows the [Code of Conduct](https://github.com/cas-bigdatalab/piflow/
### CLA
-All contributors must sign the PiFlow CLA; see [here](https://github.com/cas-bigdatalab/piflow/blob/master/Governance/image-20211118094103884.png) for details.
+All contributors must sign the PiFlow CLA; see [here](https://github.com/cas-bigdatalab/piflow/blob/master/Governance/%CF%80Flow_Open_Source_Individual_CLA.docx) for details.
diff --git a/README.md b/README.md
index 837f886c..6342357d 100644
--- a/README.md
+++ b/README.md
@@ -39,11 +39,12 @@

## Requirements
* JDK 1.8
-* Scala-2.11.8
+* Scala-2.12.18
* Apache Maven 3.1.0 or newer
-* Spark-2.1.0、 Spark-2.2.0、 Spark-2.3.0
-* Hadoop-2.6.0
-* Apache Livy-0.7.1
+* Spark-3.4.0
+* Hadoop-3.3.0
+
+Compatible with both x86 and ARM architectures; supports deployment on CentOS and Kirin systems.
## Getting Started
@@ -319,12 +320,20 @@

## Contact Us
-- Name:吴老师
-- Mobile Phone:18910263390
-- WeChat:18910263390
-- Email: wzs@cnic.cn
-- QQ Group:1003489545
- 
+- Name: Yang Gang, Tian Yao
+- Mobile Phone: 13253365393, 18501260806
+- WeChat: 13253365393, 18501260806
+- Email: ygang@cnic.cn, tianyao@cnic.cn
+- Private vulnerability contact information: ygang@cnic.cn
+- WeChat User Group
+
+
+
+
+- WeChat Official Account
+
+
+
diff --git "a/conda-pack\346\211\223\345\214\205\350\231\232\346\213\237\347\216\257\345\242\203.md" "b/conda-pack\346\211\223\345\214\205\350\231\232\346\213\237\347\216\257\345\242\203.md"
deleted file mode 100644
index 163b66e5..00000000
--- "a/conda-pack\346\211\223\345\214\205\350\231\232\346\213\237\347\216\257\345\242\203.md"
+++ /dev/null
@@ -1,55 +0,0 @@
-I used the conda package manager to pack a Python virtual environment installed by Anaconda. Note that what gets packed is an Anaconda-managed virtual environment, not the local environment.
-
-Advantage: the whole virtual environment can be packed, including binaries and the Python libraries installed with pip.
-
-Disadvantage: a conda-packed virtual environment can only be used on the same operating system; in testing, Ubuntu and CentOS turned out to be interchangeable.
-
-(Operating system: Ubuntu 20.04)
-
-1. Install Anaconda (via the install script). Tutorial: https://www.myfreax.com/how-to-install-anaconda-on-ubuntu-20-04/ , or search for an installation guide yourself.
-
-The rough process: download the script, then run it.
-
-```bash
-wget -P /tmp https://repo.anaconda.com/archive/Anaconda3-2020.02-Linux-x86_64.sh
-```
-
-After installing Anaconda, it is recommended to update it to a recent version.
-
-2. Install the conda-pack tool; installing it with pip is recommended.
-
-```bash
-pip install conda-pack
-```
-
-3. Create a Python virtual environment
-
-```
-conda create -n vir-name python=x.x  # replace vir-name with the name of your virtual environment
-```
-
-4. Activate the virtual environment
-
-```
-conda activate vir-name
-```
-
-5. Install the required packages with pip
-
-6. Pack the environment with conda-pack
-
-```
-conda pack -n my_env_name -o out_name.tar.gz
-```
-
-Packing only works in tar.gz mode here; packing directly to zip raises an error. If you need a zip, the workaround is to pack to tar.gz first, extract it, and re-compress it as zip.
-
-7. Activate the virtual environment
-
-Upload the archive from the local machine to the server (or another environment with the same operating system), extract it, and enter its bin directory.
-
-Use `source activate` to activate the environment
-
-8. Exit the virtual environment
-
-Use `source deactivate` to exit the virtual environment
\ No newline at end of file
diff --git a/config.properties b/config.properties
index 49e059fa..ad5b42ad 100644
--- a/config.properties
+++ b/config.properties
@@ -1,13 +1,13 @@
spark.master=yarn
spark.deploy.mode=cluster
-
+server.ip=172.18.32.1
#hdfs default file system
-fs.defaultFS=hdfs://10.0.82.108:9000
+fs.defaultFS=hdfs://172.18.39.41:9000
#yarn resourcemanager hostname
-yarn.resourcemanager.hostname=10.0.82.108
+yarn.resourcemanager.hostname=172.18.39.41
#if you want to use hive, set hive metastore uris
-hive.metastore.uris=thrift://10.0.82.108:9083
+#hive.metastore.uris=thrift://10.0.82.108:9083
#show data in log, set 0 if you do not show the logs
data.show=10
diff --git a/doc/tencent.jpg b/doc/tencent.jpg
new file mode 100644
index 00000000..99e30380
Binary files /dev/null and b/doc/tencent.jpg differ
diff --git a/doc/wechat_user.png b/doc/wechat_user.png
new file mode 100644
index 00000000..ea6a68c2
Binary files /dev/null and b/doc/wechat_user.png differ
diff --git a/piflow-bin/config.properties b/piflow-bin/config.properties
index 3b6dc841..a6f167b7 100644
--- a/piflow-bin/config.properties
+++ b/piflow-bin/config.properties
@@ -2,10 +2,10 @@ spark.master=yarn
spark.deploy.mode=cluster
#hdfs default file system
-fs.defaultFS=hdfs://10.0.85.83:9000
+fs.defaultFS=hdfs://172.18.39.41:9000
#yarn resourcemanager hostname
-yarn.resourcemanager.hostname=10.0.85.83
+yarn.resourcemanager.hostname=172.18.39.41
#if you want to use hive, set hive metastore uris
hive.metastore.uris=thrift://10.0.85.83:9083
diff --git a/piflow-bin/example/flow.json b/piflow-bin/example/flow.json
index 6459cad1..eff0558f 100755
--- a/piflow-bin/example/flow.json
+++ b/piflow-bin/example/flow.json
@@ -7,45 +7,15 @@
"paths": [
{
"inport": "",
- "from": "XmlParser",
- "to": "SelectField",
- "outport": ""
- },
- {
- "inport": "",
- "from": "Fork",
+ "from": "CsvParser",
"to": "CsvSave",
- "outport": "out1"
- },
- {
- "inport": "data2",
- "from": "SelectField",
- "to": "Merge",
"outport": ""
},
{
"inport": "",
- "from": "Merge",
- "to": "Fork",
- "outport": ""
- },
- {
- "inport": "data1",
"from": "CsvParser",
- "to": "Merge",
+ "to": "CsvSave",
"outport": ""
- },
- {
- "inport": "",
- "from": "Fork",
- "to": "JsonSave",
- "outport": "out3"
- },
- {
- "inport": "",
- "from": "Fork",
- "to": "PutHiveMode",
- "outport": "out2"
}
],
"executorCores": "1",
@@ -56,7 +26,7 @@
"bundle": "cn.piflow.bundle.csv.CsvSave",
"uuid": "8a80d63f720cdd2301723a4e67a52467",
"properties": {
- "csvSavePath": "hdfs://master:9000/xjzhu/phdthesis_result.csv",
+ "csvSavePath": "hdfs://172.18.32.1:9000/user/Yomi/test1.csv",
"partition": "",
"header": "false",
"saveMode": "append",
@@ -66,87 +36,18 @@
}
},
- {
- "name": "PutHiveMode",
- "bundle": "cn.piflow.bundle.hive.PutHiveMode",
- "uuid": "8a80d63f720cdd2301723a4e67a22461",
- "properties": {
- "database": "sparktest",
- "saveMode": "append",
- "table": "dblp_phdthesis"
- },
- "customizedProperties": {
-
- }
- },
{
"name": "CsvParser",
"bundle": "cn.piflow.bundle.csv.CsvParser",
"uuid": "8a80d63f720cdd2301723a4e67a82470",
"properties": {
"schema": "title,author,pages",
- "csvPath": "hdfs://master:9000/xjzhu/phdthesis.csv",
+ "csvPath": "hdfs://172.18.32.1:9000/user/Yomi/test.csv",
"delimiter": ",",
"header": "false"
},
"customizedProperties": {
- }
- },
- {
- "name": "JsonSave",
- "bundle": "cn.piflow.bundle.json.JsonSave",
- "uuid": "8a80d63f720cdd2301723a4e67a1245f",
- "properties": {
- "jsonSavePath": "hdfs://10.0.86.191:9000/xjzhu/phdthesis.json"
- },
- "customizedProperties": {
-
- }
- },
- {
- "name": "XmlParser",
- "bundle": "cn.piflow.bundle.xml.XmlParser",
- "uuid": "8a80d63f720cdd2301723a4e67a7246d",
- "properties": {
- "rowTag": "phdthesis",
- "xmlpath": "hdfs://master:9000/xjzhu/dblp.mini.xml"
- },
- "customizedProperties": {
-
- }
- },
- {
- "name": "SelectField",
- "bundle": "cn.piflow.bundle.common.SelectField",
- "uuid": "8a80d63f720cdd2301723a4e67aa2477",
- "properties": {
- "columnNames": "title,author,pages"
- },
- "customizedProperties": {
-
- }
- },
- {
- "name": "Merge",
- "bundle": "cn.piflow.bundle.common.Merge",
- "uuid": "8a80d63f720cdd2301723a4e67a92475",
- "properties": {
- "inports": "data1,data2"
- },
- "customizedProperties": {
-
- }
- },
- {
- "name": "Fork",
- "bundle": "cn.piflow.bundle.common.Fork",
- "uuid": "8a80d63f720cdd2301723a4e67a42465",
- "properties": {
- "outports": "out1,out3,out2"
- },
- "customizedProperties": {
-
}
}
]
diff --git a/piflow-bin/example/flow_2.json b/piflow-bin/example/flow_2.json
new file mode 100644
index 00000000..6459cad1
--- /dev/null
+++ b/piflow-bin/example/flow_2.json
@@ -0,0 +1,154 @@
+{
+ "flow": {
+ "name": "Example",
+ "executorMemory": "1g",
+ "executorNumber": "1",
+ "uuid": "8a80d63f720cdd2301723a4e679e2457",
+ "paths": [
+ {
+ "inport": "",
+ "from": "XmlParser",
+ "to": "SelectField",
+ "outport": ""
+ },
+ {
+ "inport": "",
+ "from": "Fork",
+ "to": "CsvSave",
+ "outport": "out1"
+ },
+ {
+ "inport": "data2",
+ "from": "SelectField",
+ "to": "Merge",
+ "outport": ""
+ },
+ {
+ "inport": "",
+ "from": "Merge",
+ "to": "Fork",
+ "outport": ""
+ },
+ {
+ "inport": "data1",
+ "from": "CsvParser",
+ "to": "Merge",
+ "outport": ""
+ },
+ {
+ "inport": "",
+ "from": "Fork",
+ "to": "JsonSave",
+ "outport": "out3"
+ },
+ {
+ "inport": "",
+ "from": "Fork",
+ "to": "PutHiveMode",
+ "outport": "out2"
+ }
+ ],
+ "executorCores": "1",
+ "driverMemory": "1g",
+ "stops": [
+ {
+ "name": "CsvSave",
+ "bundle": "cn.piflow.bundle.csv.CsvSave",
+ "uuid": "8a80d63f720cdd2301723a4e67a52467",
+ "properties": {
+ "csvSavePath": "hdfs://master:9000/xjzhu/phdthesis_result.csv",
+ "partition": "",
+ "header": "false",
+ "saveMode": "append",
+ "delimiter": ","
+ },
+ "customizedProperties": {
+
+ }
+ },
+ {
+ "name": "PutHiveMode",
+ "bundle": "cn.piflow.bundle.hive.PutHiveMode",
+ "uuid": "8a80d63f720cdd2301723a4e67a22461",
+ "properties": {
+ "database": "sparktest",
+ "saveMode": "append",
+ "table": "dblp_phdthesis"
+ },
+ "customizedProperties": {
+
+ }
+ },
+ {
+ "name": "CsvParser",
+ "bundle": "cn.piflow.bundle.csv.CsvParser",
+ "uuid": "8a80d63f720cdd2301723a4e67a82470",
+ "properties": {
+ "schema": "title,author,pages",
+ "csvPath": "hdfs://master:9000/xjzhu/phdthesis.csv",
+ "delimiter": ",",
+ "header": "false"
+ },
+ "customizedProperties": {
+
+ }
+ },
+ {
+ "name": "JsonSave",
+ "bundle": "cn.piflow.bundle.json.JsonSave",
+ "uuid": "8a80d63f720cdd2301723a4e67a1245f",
+ "properties": {
+ "jsonSavePath": "hdfs://10.0.86.191:9000/xjzhu/phdthesis.json"
+ },
+ "customizedProperties": {
+
+ }
+ },
+ {
+ "name": "XmlParser",
+ "bundle": "cn.piflow.bundle.xml.XmlParser",
+ "uuid": "8a80d63f720cdd2301723a4e67a7246d",
+ "properties": {
+ "rowTag": "phdthesis",
+ "xmlpath": "hdfs://master:9000/xjzhu/dblp.mini.xml"
+ },
+ "customizedProperties": {
+
+ }
+ },
+ {
+ "name": "SelectField",
+ "bundle": "cn.piflow.bundle.common.SelectField",
+ "uuid": "8a80d63f720cdd2301723a4e67aa2477",
+ "properties": {
+ "columnNames": "title,author,pages"
+ },
+ "customizedProperties": {
+
+ }
+ },
+ {
+ "name": "Merge",
+ "bundle": "cn.piflow.bundle.common.Merge",
+ "uuid": "8a80d63f720cdd2301723a4e67a92475",
+ "properties": {
+ "inports": "data1,data2"
+ },
+ "customizedProperties": {
+
+ }
+ },
+ {
+ "name": "Fork",
+ "bundle": "cn.piflow.bundle.common.Fork",
+ "uuid": "8a80d63f720cdd2301723a4e67a42465",
+ "properties": {
+ "outports": "out1,out3,out2"
+ },
+ "customizedProperties": {
+
+ }
+ }
+ ]
+ }
+}
diff --git a/piflow-bin/server.ip b/piflow-bin/server.ip
index 8f2cf70c..f32ec10e 100644
--- a/piflow-bin/server.ip
+++ b/piflow-bin/server.ip
@@ -1 +1 @@
-server.ip=10.0.85.83
+server.ip=172.18.32.1
diff --git a/piflow-bundle/config.properties b/piflow-bundle/config.properties
index 78495b0f..90776c21 100644
--- a/piflow-bundle/config.properties
+++ b/piflow-bundle/config.properties
@@ -2,9 +2,9 @@ spark.master=yarn
spark.deploy.mode=cluster
#hdfs default file system
-fs.defaultFS=hdfs://10.0.86.191:9000
+fs.defaultFS=hdfs://172.18.39.41:9000
#yarn resourcemanager hostname
-yarn.resourcemanager.hostname=10.0.86.191
+yarn.resourcemanager.hostname=172.18.39.41
#if you want to use hive, set hive metastore uris
hive.metastore.uris=thrift://10.0.86.191:9083
@@ -19,4 +19,10 @@ monitor.throughput=true
server.port=8001
#h2db port
-h2.port=50001
\ No newline at end of file
+h2.port=50001
+
+#ceph config
+ceph.accessKey=123456
+ceph.secretKey=123456
+ceph.bucket=*****
+ceph.domain.ip=xxxxxx (must be a valid URL accepted by okhttp3.HttpUrl)
\ No newline at end of file
diff --git a/piflow-bundle/pom.xml b/piflow-bundle/pom.xml
index 6a8ee7a4..c4912acd 100644
--- a/piflow-bundle/pom.xml
+++ b/piflow-bundle/pom.xml
@@ -11,41 +11,14 @@
UTF-8
9.0.0.M0
- 2.12.18
+ 2.11.8
1.8
- 0.3.1
+ 2.5.32
+ 10.1.12
piflow-bundle
-
-
-
- org.apache.hbase
- hbase-client
- 2.5.5
-
-
-
-
- org.apache.hbase
- hbase-mapreduce
- 2.5.5
-
-
-
-
- com.crealytics
- spark-excel_2.12
- 3.3.1_0.18.7
-
-
-
- org.elasticsearch
- elasticsearch-spark-30_2.12
- 8.3.3
-
-
ch.ethz.ganymed
ganymed-ssh2
@@ -53,17 +26,12 @@
- ru.yandex.clickhouse
- clickhouse-jdbc
- ${clickhouse.version}
-
-
- com.fasterxml.jackson.core
- *
-
-
+ com.alibaba
+ fastjson
+ 1.2.58
+
org.neo4j.driver
neo4j-java-driver
@@ -82,6 +50,19 @@
biojava-structure
4.0.0
+
+ org.apache.hive
+ hive-jdbc
+ 1.2.1
+
+
+ httpclient
+ org.apache.httpcomponents
+
+
+
org.mongodb
@@ -94,6 +75,12 @@
org.apache.solr
solr-solrj
7.2.0
+
+
+ httpclient
+ org.apache.httpcomponents
+
+
@@ -116,7 +103,7 @@
org.clapper
- classutil_2.12
+ classutil_2.11
1.3.0
@@ -128,41 +115,57 @@
com.chuusai
- shapeless_2.12
- 2.3.7
+ shapeless_2.11
+ 2.3.1
com.sksamuel.scrimage
- scrimage-core_2.12
- 2.1.8
+ scrimage-core_2.11
+ 2.1.7
com.sksamuel.scrimage
- scrimage-io-extra_2.12
- 2.1.8
+ scrimage-io-extra_2.11
+ 2.1.7
com.sksamuel.scrimage
- scrimage-filters_2.12
- 2.1.8
+ scrimage-filters_2.11
+ 2.1.7
+
+
+
+ org.slf4j
+ slf4j-api
+ 1.7.25
-
net.liftweb
- lift-json_2.12
- 3.3.0
+ lift-json_2.11
+ 2.6.1
com.databricks
- spark-xml_2.12
- 0.5.0
+ spark-xml_2.11
+ 0.4.1
+
+
+
black.ninia
jep
@@ -182,6 +185,19 @@
0.11.0.0
+
+ org.elasticsearch
+ elasticsearch-hadoop
+ 7.6.1
+
+
+
+
+ org.elasticsearch
+ elasticsearch
+ 7.6.1
+
+
org.jsoup
@@ -189,6 +205,7 @@
1.10.3
+
org.json
json
@@ -201,39 +218,65 @@
1.9.1
-
-
-
-
-
+
+ com.memcached
+ java_memcached-release
+ 2.6.6
+
-
+
+
- io.netty
- netty-all
- 4.1.89.Final
+ org.apache.flume
+ flume-ng-core
+ 1.8.0
-
+
+
+
+
+
+
+
org.apache.hbase
hbase-client
- 2.5.5-hadoop3
+ 1.2.6
+
+
+ httpclient
+ org.apache.httpcomponents
+
+
-
+
org.apache.hbase
hbase-server
- 2.5.5-hadoop3
+ 1.2.6
-
net.sourceforge.jexcelapi
jxl
2.6.12
+
+
+ org.apache.poi
+ poi-ooxml
+
+ 3.17
+
+
+ org.apache.xmlbeans
+ xmlbeans
+
+
+
+
net.sf.json-lib
@@ -247,12 +290,23 @@
commons-pool2
2.4.2
-
org.apache.commons
commons-lang3
3.5
+
+
+
+
+
+
+
+ ftpClient
+ edtftp
+ 1.0.0
+
+
@@ -264,37 +318,67 @@
org.apache.httpcomponents
httpclient
- 4.5.13
+ 4.5.3
org.apache.httpcomponents
httpmime
- 4.5.13
+ 4.5.3
-
+
- com.oracle.database.jdbc
+ oracle
ojdbc6
- 11.2.0.4
+ 11.2.0.3
-
+
+
- com.taosdata.jdbc
- taos-jdbcdriver
- 2.0.36
+ com.typesafe.akka
+ akka-remote_2.11
+ ${akka.version}
-
-
-
- io.hetu.core
- hetu-jdbc
- 1.6.0
-
+
+
+ com.typesafe.akka
+ akka-actor_2.11
+ ${akka.version}
+
+
+
+ com.typesafe.akka
+ akka-http_2.11
+ ${akka.http.version}
+
+
+
+ com.crealytics
+ spark-excel_2.11
+ 0.13.7
+
+
+
+ org.apache.commons
+ commons-collections4
+ 4.1
+
+
+
+
+ org.apache.xmlbeans
+ xmlbeans
+ 3.1.0
+
+
+
+
+
+
org.apache.maven.plugins
maven-install-plugin
@@ -315,6 +399,41 @@
true
+
+
+
+ install-external-2
+
+ install-file
+
+ install
+
+ ${basedir}/lib/ojdbc6-11.2.0.3.jar
+ oracle
+ ojdbc6
+ 11.2.0.3
+ jar
+ true
+
+
+
+
+ install-external-4
+
+ install-file
+
+ install
+
+ ${basedir}/lib/edtftpj.jar
+ ftpClient
+ edtftp
+ 1.0.0
+ jar
+ true
+
+
+
+
@@ -342,28 +461,37 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+ org.apache.maven.plugins
+ maven-install-plugin
+ 2.5.2
+
+
+ install-databricks
+ install-file
+ clean
+
+ ${basedir}/lib/spark-xml_2.11-0.4.2.jar
+ com.databricks
+ spark-xml_2.11
+ 0.4.1
+ jar
+ true
+
+
+
+
+
+
+
+
+ io.netty
+ netty-all
+ 4.1.68.Final
+
+
+
\ No newline at end of file
diff --git a/piflow-bundle/server.ip b/piflow-bundle/server.ip
index 39633ba6..3defccaa 100644
--- a/piflow-bundle/server.ip
+++ b/piflow-bundle/server.ip
@@ -1 +1 @@
-server.ip=10.0.85.83
\ No newline at end of file
+server.ip=172.18.32.1
\ No newline at end of file
diff --git a/piflow-bundle/src/main/resources/flow/normalization/Discretization.json b/piflow-bundle/src/main/resources/flow/normalization/Discretization.json
new file mode 100644
index 00000000..f445a4ff
--- /dev/null
+++ b/piflow-bundle/src/main/resources/flow/normalization/Discretization.json
@@ -0,0 +1,38 @@
+{
+ "flow":{
+ "name":"test",
+ "uuid":"1234",
+ "stops":[
+ {
+ "uuid":"0000",
+ "name":"SelectHiveQL",
+ "bundle":"cn.piflow.bundle.hive.SelectHiveQL",
+ "properties":{
+ "hiveQL":"select * from test.clean"
+ }
+ },
+ {
+ "uuid":"1111",
+ "name":"Discretization",
+ "bundle":"cn.piflow.bundle.normalization.Discretization",
+ "properties":{
+ "inputCol":"pre_normalization",
+ "outputCol":"finished_normalization",
+ "method": "EqualWidth",
+ "numBins": "5",
+ "k": "4"
+ }
+
+ }
+
+ ],
+ "paths":[
+ {
+ "from":"SelectHiveQL",
+ "outport":"",
+ "inport":"",
+ "to":"Discretization"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/piflow-bundle/src/main/resources/flow/normalization/MaxMinNormalization.json b/piflow-bundle/src/main/resources/flow/normalization/MaxMinNormalization.json
new file mode 100644
index 00000000..83a6f085
--- /dev/null
+++ b/piflow-bundle/src/main/resources/flow/normalization/MaxMinNormalization.json
@@ -0,0 +1,35 @@
+{
+ "flow":{
+ "name":"test",
+ "uuid":"1234",
+ "stops":[
+ {
+ "uuid":"0000",
+ "name":"SelectHiveQL",
+ "bundle":"cn.piflow.bundle.hive.SelectHiveQL",
+ "properties":{
+ "hiveQL":"select * from test.clean"
+ }
+ },
+ {
+ "uuid":"1111",
+ "name":"MaxMinNormalization",
+ "bundle":"cn.piflow.bundle.normalization.MaxMinNormalization",
+ "properties":{
+ "inputCol":"pre_normalization",
+ "outputCol":"finished_normalization"
+ }
+
+ }
+
+ ],
+ "paths":[
+ {
+ "from":"SelectHiveQL",
+ "outport":"",
+ "inport":"",
+ "to":"MaxMinNormalization"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/piflow-bundle/src/main/resources/flow/normalization/ScopeNormalization.json b/piflow-bundle/src/main/resources/flow/normalization/ScopeNormalization.json
new file mode 100644
index 00000000..4d0dac12
--- /dev/null
+++ b/piflow-bundle/src/main/resources/flow/normalization/ScopeNormalization.json
@@ -0,0 +1,37 @@
+{
+ "flow":{
+ "name":"test",
+ "uuid":"1234",
+ "stops":[
+ {
+ "uuid":"0000",
+ "name":"SelectHiveQL",
+ "bundle":"cn.piflow.bundle.hive.SelectHiveQL",
+ "properties":{
+ "hiveQL":"select * from test.clean"
+ }
+ },
+ {
+ "uuid":"1111",
+ "name":"ScopeNormalization",
+ "bundle":"cn.piflow.bundle.normalization.ScopeNormalization",
+ "properties":{
+ "inputCol":"pre_normalization",
+ "outputCol":"finished_normalization",
+ "range": "(0.0, 3.0)"
+
+ }
+
+ }
+
+ ],
+ "paths":[
+ {
+ "from":"SelectHiveQL",
+ "outport":"",
+ "inport":"",
+ "to":"ScopeNormalization"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/piflow-bundle/src/main/resources/flow/normalization/ZScore.json b/piflow-bundle/src/main/resources/flow/normalization/ZScore.json
new file mode 100644
index 00000000..8a879b81
--- /dev/null
+++ b/piflow-bundle/src/main/resources/flow/normalization/ZScore.json
@@ -0,0 +1,35 @@
+{
+ "flow":{
+ "name":"test",
+ "uuid":"1234",
+ "stops":[
+ {
+ "uuid":"0000",
+ "name":"SelectHiveQL",
+ "bundle":"cn.piflow.bundle.hive.SelectHiveQL",
+ "properties":{
+ "hiveQL":"select * from test.clean"
+ }
+ },
+ {
+ "uuid":"1112",
+ "name":"ZScore",
+ "bundle":"cn.piflow.bundle.normalization.ZScore",
+ "properties":{
+ "inputCols":"pre_normalization",
+ "outputCols":"finished_normalization"
+ }
+
+ }
+
+ ],
+ "paths":[
+ {
+ "from":"SelectHiveQL",
+ "outport":"",
+ "inport":"",
+ "to":"ZScore"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/piflow-bundle/src/main/resources/flow/script/scala.json b/piflow-bundle/src/main/resources/flow/script/scala.json
index d18c5498..208edf49 100644
--- a/piflow-bundle/src/main/resources/flow/script/scala.json
+++ b/piflow-bundle/src/main/resources/flow/script/scala.json
@@ -20,7 +20,7 @@
"bundle":"cn.piflow.bundle.script.ExecuteScalaFile",
"properties":{
"plugin": "ScalaTest_ExecuteScalaFile_123123123",
- "script":" val df = in.read()\n df.show()\n df.createOrReplaceTempView(\"people\")\n val df1 = spark.sql(\"select * from people where author like '%xjzhu%'\")\n out.write(df1)"
+ "script":" val df = in.read().getSparkDf\n df.show()\n df.createOrReplaceTempView(\"people\")\n val df1 = spark.sql(\"select * from people where author like '%xjzhu%'\")\n out.write(df1)"
}
},
{
diff --git a/piflow-bundle/src/main/resources/icon/ceph.png b/piflow-bundle/src/main/resources/icon/ceph.png
new file mode 100644
index 00000000..0ef6b557
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/ceph.png differ
diff --git a/piflow-bundle/src/main/resources/icon/ceph/ceph.png b/piflow-bundle/src/main/resources/icon/ceph/ceph.png
new file mode 100644
index 00000000..bd877694
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/ceph/ceph.png differ
diff --git a/piflow-bundle/src/main/resources/icon/jdbc/dameng.png b/piflow-bundle/src/main/resources/icon/jdbc/dameng.png
new file mode 100644
index 00000000..5a64c1f9
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/jdbc/dameng.png differ
diff --git a/piflow-bundle/src/main/resources/icon/jdbc/tbase.png b/piflow-bundle/src/main/resources/icon/jdbc/tbase.png
index 0dc907b8..671477c1 100644
Binary files a/piflow-bundle/src/main/resources/icon/jdbc/tbase.png and b/piflow-bundle/src/main/resources/icon/jdbc/tbase.png differ
diff --git a/piflow-bundle/src/main/resources/icon/normalization/DiscretizationNormalization.png b/piflow-bundle/src/main/resources/icon/normalization/DiscretizationNormalization.png
new file mode 100644
index 00000000..7c62193a
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/normalization/DiscretizationNormalization.png differ
diff --git a/piflow-bundle/src/main/resources/icon/normalization/MaxMinNormalization.png b/piflow-bundle/src/main/resources/icon/normalization/MaxMinNormalization.png
new file mode 100644
index 00000000..9a9511e8
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/normalization/MaxMinNormalization.png differ
diff --git a/piflow-bundle/src/main/resources/icon/normalization/ScopeNormalization.png b/piflow-bundle/src/main/resources/icon/normalization/ScopeNormalization.png
new file mode 100644
index 00000000..0fc1b6aa
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/normalization/ScopeNormalization.png differ
diff --git a/piflow-bundle/src/main/resources/icon/normalization/ZScoreNormalization.png b/piflow-bundle/src/main/resources/icon/normalization/ZScoreNormalization.png
new file mode 100644
index 00000000..1c9b85e4
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/normalization/ZScoreNormalization.png differ
diff --git a/piflow-bundle/src/main/resources/icon/unstructured/DocxParser.png b/piflow-bundle/src/main/resources/icon/unstructured/DocxParser.png
new file mode 100644
index 00000000..5a7b42a4
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/unstructured/DocxParser.png differ
diff --git a/piflow-bundle/src/main/resources/icon/unstructured/HtmlParser.png b/piflow-bundle/src/main/resources/icon/unstructured/HtmlParser.png
new file mode 100644
index 00000000..ea4c3df4
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/unstructured/HtmlParser.png differ
diff --git a/piflow-bundle/src/main/resources/icon/unstructured/ImageParser.png b/piflow-bundle/src/main/resources/icon/unstructured/ImageParser.png
new file mode 100644
index 00000000..f9f63b0a
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/unstructured/ImageParser.png differ
diff --git a/piflow-bundle/src/main/resources/icon/unstructured/PdfParser.png b/piflow-bundle/src/main/resources/icon/unstructured/PdfParser.png
new file mode 100644
index 00000000..8ee74b9c
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/unstructured/PdfParser.png differ
diff --git a/piflow-bundle/src/main/resources/icon/unstructured/PptxParser.png b/piflow-bundle/src/main/resources/icon/unstructured/PptxParser.png
new file mode 100644
index 00000000..c9186149
Binary files /dev/null and b/piflow-bundle/src/main/resources/icon/unstructured/PptxParser.png differ
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/TDengine/TDengineRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/TDengine/TDengineRead.scala
index 9fce6871..70720ab0 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/TDengine/TDengineRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/TDengine/TDengineRead.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
@@ -31,7 +32,7 @@ class TDengineRead extends ConfigurableStop{
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
override def setProperties(map: Map[String, Any]): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/TDengine/TDengineWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/TDengine/TDengineWrite.scala
index df90de0c..bbb22220 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/TDengine/TDengineWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/TDengine/TDengineWrite.scala
@@ -29,7 +29,7 @@ class TDengineWrite extends ConfigurableStop{
properties.put("user", user)
properties.put("password", password)
properties.put("driver",driver)
- val df = in.read()
+ val df = in.read().getSparkDf
df.write.mode(SaveMode.Append).jdbc(url,dbtable,properties)
}
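
The `in.read().getSparkDf` / `out.write(new SciDataFrame(...))` change above recurs across most stops in this patch: stops now exchange a SciDataFrame wrapper instead of a bare Spark DataFrame. A minimal sketch of a migrated `perform` body, using only the wrapper calls visible in this diff (the surrounding ConfigurableStop boilerplate is omitted and the filter step is an illustrative placeholder, not part of any real stop):

```scala
import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream}

// Sketch of the recurring migration in this patch.
class SciDataFrameMigrationSketch {
  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val df = in.read().getSparkDf                  // before this patch: val df = in.read()
    val result = df.filter("author is not null")   // illustrative per-stop transformation
    out.write(new SciDataFrame(result))            // before this patch: out.write(result)
  }
}
```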
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/arrowflight/ArrowFlightOut.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/arrowflight/ArrowFlightOut.scala
new file mode 100644
index 00000000..b561c896
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/arrowflight/ArrowFlightOut.scala
@@ -0,0 +1,219 @@
+package cn.piflow.bundle.arrowflight
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import cn.piflow.conf._
+import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.execution.arrow.ArrowConverters
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.util.ArrowUtils
+import org.apache.arrow.memory.RootAllocator
+import org.apache.arrow.vector.ipc.{ArrowFileWriter, WriteChannel}
+import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, Float8Vector, IntVector, ValueVector, VarCharVector, VectorSchemaRoot}
+import org.apache.arrow.vector.util.VectorBatchAppender
+
+import java.io.{File, FileOutputStream}
+import java.net.{ServerSocket, Socket}
+import java.nio.channels.Channels
+import scala.collection.JavaConverters._
+import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}
+import org.apache.arrow.vector.types.{FloatingPointPrecision, TimeUnit}
+import org.apache.spark.sql.types._
+
+import java.nio.charset.StandardCharsets
+
+
+class ArrowFlightOut extends ConfigurableStop{
+ val authorEmail: String = "zjliang@cnic.cn"
+ val description: String = "Output the data as arrow file format."
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var outputIp: String = _
+// var header: Boolean = _
+// var delimiter: String = _
+// var partition :String= _
+// var saveMode:String = _
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ outputIp = MapUtil.get(map,"outputIp").asInstanceOf[String]
+// header = MapUtil.get(map,"header").asInstanceOf[String].toBoolean
+// delimiter = MapUtil.get(map,"delimiter").asInstanceOf[String]
+// partition = MapUtil.get(map,key="partition").asInstanceOf[String]
+// saveMode = MapUtil.get(map,"saveMode").asInstanceOf[String]
+
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+
+// val saveModeOption = Set("append","overwrite","error","ignore")
+ var descriptor : List[PropertyDescriptor] = List()
+
+ val outputIp = new PropertyDescriptor()
+ .name("outputIp")
+ .displayName("outputIp")
+ .description("The output ip of file")
+ .defaultValue("")
+ .required(true)
+ .example("127.0.0.1")
+ descriptor = outputIp :: descriptor
+
+// val header = new PropertyDescriptor()
+// .name("header")
+// .displayName("Header")
+// .description("Whether the csv file has a header")
+// .allowableValues(Set("true","false"))
+// .defaultValue("false")
+// .required(true)
+// .example("false")
+// descriptor = header :: descriptor
+//
+// val delimiter = new PropertyDescriptor()
+// .name("delimiter")
+// .displayName("Delimiter")
+// .description("The delimiter of csv file")
+// .defaultValue(",")
+// .required(true)
+// .example(",")
+// descriptor = delimiter :: descriptor
+//
+// val partition = new PropertyDescriptor()
+// .name("partition")
+// .displayName("Partition")
+// .description("The partition of csv file,you can specify the number of partitions saved as csv or not")
+// .defaultValue("")
+// .required(false)
+// .example("3")
+// descriptor = partition :: descriptor
+//
+// val saveMode = new PropertyDescriptor()
+// .name("saveMode")
+// .displayName("SaveMode")
+// .description("The save mode for csv file")
+// .allowableValues(saveModeOption)
+// .defaultValue("append")
+// .required(true)
+// .example("append")
+// descriptor = saveMode :: descriptor
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/csv/CsvSave.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroup.FlightGroup)
+ }
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+ def sparkTypeToArrowType(dataType: DataType): ArrowType = dataType match {
+ case IntegerType => new ArrowType.Int(32, true)
+ case LongType => new ArrowType.Int(64, true)
+ case FloatType => new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
+ case DoubleType => new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
+ case StringType => new ArrowType.Utf8()
+ case BooleanType => ArrowType.Bool.INSTANCE
+ case BinaryType => ArrowType.Binary.INSTANCE
+ case TimestampType => new ArrowType.Timestamp(TimeUnit.MILLISECOND, null)
+ case _ => throw new UnsupportedOperationException(s"Unsupported type: $dataType")
+ }
+
+ def toArrowSchema(schema: StructType): Schema = {
+ val fields = schema.fields.map { field =>
+ new Field(
+ field.name,
+ FieldType.nullable(sparkTypeToArrowType(field.dataType)),
+ null
+ )
+ }.toList
+ new Schema(fields.asJava)
+ }
+
+ type FieldProcessor = (Int, Any) => Unit
+ private def createFieldProcessor(sparkType: DataType, vector: ValueVector): FieldProcessor =
+ (sparkType, vector) match {
+ // Int type (Integer/Numeric)
+ case (_: IntegerType, vec: IntVector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, value.asInstanceOf[Int])
+ // String type
+ case (_: StringType, vec: VarCharVector) => (rowIdx, value) =>
+ if (value == null) {
+ vec.setNull(rowIdx)
+ } else {
+ val strValue = value.toString
+ val bytes = strValue.getBytes(StandardCharsets.UTF_8)
+ vec.setSafe(rowIdx, bytes, 0, bytes.length)
+ }
+ // Double type
+ case (_: DoubleType, vec: Float8Vector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, value.asInstanceOf[Double])
+ // Long type
+ case (_: LongType, vec: BigIntVector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, value.asInstanceOf[Long])
+ // Boolean type (uses BitVector)
+ case (_: BooleanType, vec: BitVector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, if (value.asInstanceOf[Boolean]) 1 else 0)
+ // Date type (example)
+ case (_: DateType, vec: DateDayVector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, value.asInstanceOf[Int]) // convert according to the actual date format as needed
+ case _ => throw new IllegalArgumentException(
+ s"Unsupported type combination: SparkType=$sparkType, VectorType=${vector.getClass}"
+ )
+ }
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val df = in.read().getSparkDf
+
+ val allocator = new RootAllocator(Long.MaxValue)
+ val arrowSchema = toArrowSchema(df.schema)
+ val root = VectorSchemaRoot.create(arrowSchema, allocator)
+
+ val serverSocket = new ServerSocket(9090)
+
+ println("Server is listening on port 9090")
+
+
+ try {
+ root.allocateNew()
+ // create the type-mapped field processors
+ val fieldProcessors = df.schema.zipWithIndex.map { case (field, idx) =>
+ createFieldProcessor(field.dataType, root.getVector(idx)) // dynamically bind to the corresponding Vector type
+ }
+ // process the data row by row
+ val rows = df.collect().toList
+ root.setRowCount(rows.size)
+ for {
+ (row, rowIndex) <- rows.zipWithIndex
+ (value, processor) <- row.toSeq.zip(fieldProcessors)
+ } {
+ processor(rowIndex, value) // write the value in a type-safe way
+ }
+ val socket: Socket = serverSocket.accept()
+ val writer = new ArrowFileWriter(root, null, Channels.newChannel(socket.getOutputStream))
+ try {
+
+ writer.start()
+ writer.writeBatch()
+ writer.end()
+ } finally {
+ writer.close()
+ socket.close()
+ }
+ } finally {
+ root.close()
+ allocator.close()
+ }
+ }
+
+}
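
ArrowFlightOut blocks on `serverSocket.accept()` and writes a single Arrow-file batch to whichever client connects on port 9090. A minimal client sketch for consuming that output, assuming the stop is reachable on localhost; the host, object name, and buffering strategy are illustrative assumptions, not part of this patch:

```scala
import java.io.ByteArrayOutputStream
import java.net.Socket

import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileReader
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel

object ArrowFlightOutClientSketch {
  def main(args: Array[String]): Unit = {
    // Pull the whole payload into memory: the Arrow *file* format written by
    // ArrowFileWriter above needs a seekable channel so the footer can be located.
    val socket = new Socket("127.0.0.1", 9090)
    val bytes = try {
      val in = socket.getInputStream
      val buf = new Array[Byte](8192)
      val out = new ByteArrayOutputStream()
      var n = in.read(buf)
      while (n != -1) { out.write(buf, 0, n); n = in.read(buf) }
      out.toByteArray
    } finally socket.close()

    val allocator = new RootAllocator(Long.MaxValue)
    val reader = new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(bytes), allocator)
    try {
      while (reader.loadNextBatch()) {
        val root = reader.getVectorSchemaRoot
        println(s"batch: ${root.getRowCount} rows, schema: ${root.getSchema}")
      }
    } finally {
      reader.close()
      allocator.close()
    }
  }
}
```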
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/asr/ChineseSpeechRecognition.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/asr/ChineseSpeechRecognition.scala
index 46263250..3a7a8760 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/asr/ChineseSpeechRecognition.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/asr/ChineseSpeechRecognition.scala
@@ -1,11 +1,11 @@
package cn.piflow.bundle.asr
import java.io.{File, FileNotFoundException}
-
import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.http.entity.ContentType
import org.apache.http.util.EntityUtils
import org.apache.spark.rdd.RDD
@@ -90,7 +90,7 @@ class ChineseSpeechRecognition extends ConfigurableStop {
))
val df: DataFrame = session.createDataFrame(rowRDD,schema)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ceph/CephRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ceph/CephRead.scala
new file mode 100644
index 00000000..997392fb
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ceph/CephRead.scala
@@ -0,0 +1,158 @@
+package cn.piflow.bundle.ceph
+
+import cn.piflow._
+import cn.piflow.conf._
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+class CephRead extends ConfigurableStop {
+
+ val authorEmail: String = "niuzj@gmqil.com"
+ val description: String = "Read data from ceph"
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var cephAccessKey:String = _
+ var cephSecretKey:String = _
+ var cephEndpoint:String = _
+ var types: String = _
+ var path: String = _
+ var header: Boolean = _
+ var delimiter: String = _
+
+ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+
+ spark.conf.set("fs.s3a.access.key", cephAccessKey)
+ spark.conf.set("fs.s3a.secret.key", cephSecretKey)
+ spark.conf.set("fs.s3a.endpoint", cephEndpoint)
+ spark.conf.set("fs.s3a.connection.ssl.enabled", "false")
+
+ var df:DataFrame = null
+
+ if (types == "parquet") {
+ df = spark.read
+ .parquet(path)
+ }
+
+ if (types == "csv") {
+
+ df = spark.read
+ .option("header", header)
+ .option("inferSchema", "true")
+ .option("delimiter", delimiter)
+ .csv(path)
+ }
+
+ if (types == "json") {
+ df = spark.read
+ .json(path)
+ }
+
+ out.write(new SciDataFrame(df))
+ }
+
+ def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ cephAccessKey = MapUtil.get(map,"cephAccessKey").asInstanceOf[String]
+ cephSecretKey = MapUtil.get(map, "cephSecretKey").asInstanceOf[String]
+ cephEndpoint = MapUtil.get(map,"cephEndpoint").asInstanceOf[String]
+ types = MapUtil.get(map,"types").asInstanceOf[String]
+ path = MapUtil.get(map,"path").asInstanceOf[String]
+ header = MapUtil.get(map, "header").asInstanceOf[String].toBoolean
+ delimiter = MapUtil.get(map, "delimiter").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+
+ var descriptor : List[PropertyDescriptor] = List()
+
+ val cephAccessKey=new PropertyDescriptor()
+ .name("cephAccessKey")
+ .displayName("cephAccessKey")
+ .description("This parameter is of type String and represents the access key used to authenticate with the Ceph storage system.")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ descriptor = cephAccessKey :: descriptor
+
+ val cephSecretKey=new PropertyDescriptor()
+ .name("cephSecretKey")
+ .displayName("cephSecretKey")
+ .description("This parameter is of type String and represents the secret key used to authenticate with the Ceph storage system")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ descriptor = cephSecretKey :: descriptor
+
+
+
+ val cephEndpoint = new PropertyDescriptor()
+ .name("cephEndpoint")
+ .displayName("cephEndpoint")
+ .description("This parameter is of type String and represents the endpoint URL of the Ceph storage system. It is used to establish a connection with the Ceph cluster")
+ .defaultValue("")
+ .required(true)
+ .example("http://cephcluster:7480")
+ .sensitive(true)
+ descriptor = cephEndpoint :: descriptor
+
+ val types = new PropertyDescriptor()
+ .name("types")
+ .displayName("Types")
+ .description("The format you want to write is json,csv,parquet")
+ .defaultValue("csv")
+ .allowableValues(Set("json", "csv", "parquet"))
+ .required(true)
+ .example("csv")
+ descriptor = types :: descriptor
+
+ val header = new PropertyDescriptor()
+ .name("header")
+ .displayName("Header")
+ .description("Whether the csv file has a header")
+ .defaultValue("false")
+ .allowableValues(Set("true", "false"))
+ .required(true)
+ .example("true")
+ descriptor = header :: descriptor
+
+ val delimiter = new PropertyDescriptor()
+ .name("delimiter")
+ .displayName("Delimiter")
+ .description("The delimiter of csv file")
+ .defaultValue("")
+ .required(true)
+ .example(",")
+ descriptor = delimiter :: descriptor
+
+
+ val path = new PropertyDescriptor()
+ .name("path")
+ .displayName("Path")
+ .description("The file path you want to write to")
+ .defaultValue("")
+ .required(true)
+ .example("s3a://radosgw-test/test_df")
+ descriptor = path :: descriptor
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/ceph/ceph.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroup.CephGroup)
+ }
+
+
+}
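
For reference, CephRead's csv branch boils down to a plain Spark S3A read against a Ceph RGW endpoint. A standalone sketch using the example endpoint and path from the property descriptors above, with placeholder credentials (not a real configuration; requires hadoop-aws and the AWS SDK on the classpath):

```scala
import org.apache.spark.sql.SparkSession

// Standalone sketch of what CephRead does for types = "csv"; keys, endpoint and path are placeholders.
object CephReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ceph-read-sketch").master("local[*]").getOrCreate()
    spark.conf.set("fs.s3a.access.key", "ACCESS_KEY")
    spark.conf.set("fs.s3a.secret.key", "SECRET_KEY")
    spark.conf.set("fs.s3a.endpoint", "http://cephcluster:7480")
    spark.conf.set("fs.s3a.connection.ssl.enabled", "false")

    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ",")
      .csv("s3a://radosgw-test/test_df")

    df.show()
    spark.stop()
  }
}
```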
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ceph/CephWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ceph/CephWrite.scala
new file mode 100644
index 00000000..9fd5910a
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ceph/CephWrite.scala
@@ -0,0 +1,162 @@
+package cn.piflow.bundle.ceph
+
+import cn.piflow._
+import cn.piflow.conf._
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil,MapUtil}
+import org.apache.spark.sql.SparkSession
+
+
+class CephWrite extends ConfigurableStop {
+
+
+ val authorEmail: String = "niuzj@gmqil.com"
+ val description: String = "Read data from ceph"
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var cephAccessKey:String = _
+ var cephSecretKey:String = _
+ var cephEndpoint:String = _
+ var types: String = _
+ var path:String = _
+ var header: Boolean = _
+ var delimiter: String = _
+
+
+ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+
+ spark.conf.set("fs.s3a.access.key", cephAccessKey)
+ spark.conf.set("fs.s3a.secret.key", cephSecretKey)
+ spark.conf.set("fs.s3a.endpoint", cephEndpoint)
+ spark.conf.set("fs.s3a.connection.ssl.enabled", "false")
+
+ // Create a DataFrame from the data
+ val df = in.read().getSparkDf
+
+ if (types == "parquet") {
+ df.write
+ .format("parquet")
+ .mode("overwrite") // only overwrite
+ .save(path)
+ }
+
+ if (types == "csv") {
+ df.write
+ .format("csv")
+ .option("header", header)
+ .option("delimiter",delimiter)
+ .mode("overwrite")
+ .save(path)
+ }
+
+ if (types == "json") {
+ df.write
+ .format("json")
+ .mode("overwrite")
+ .save(path)
+ }
+
+ }
+
+ def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ cephAccessKey = MapUtil.get(map, "cephAccessKey").asInstanceOf[String]
+ cephSecretKey = MapUtil.get(map, "cephSecretKey").asInstanceOf[String]
+ cephEndpoint = MapUtil.get(map, "cephEndpoint").asInstanceOf[String]
+ types = MapUtil.get(map, "types").asInstanceOf[String]
+ path = MapUtil.get(map, "path").asInstanceOf[String]
+ header = MapUtil.get(map, "header").asInstanceOf[String].toBoolean
+ delimiter = MapUtil.get(map, "delimiter").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+
+ var descriptor : List[PropertyDescriptor] = List()
+
+ val cephAccessKey=new PropertyDescriptor()
+ .name("cephAccessKey")
+ .displayName("cephAccessKey")
+ .description("This parameter is of type String and represents the access key used to authenticate with the Ceph storage system.")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ descriptor = cephAccessKey :: descriptor
+
+ val cephSecretKey=new PropertyDescriptor()
+ .name("cephSecretKey")
+ .displayName("cephSecretKey")
+ .description("This parameter is of type String and represents the secret key used to authenticate with the Ceph storage system")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ descriptor = cephSecretKey :: descriptor
+
+ val cephEndpoint = new PropertyDescriptor()
+ .name("cephEndpoint")
+ .displayName("cephEndpoint")
+ .description("This parameter is of type String and represents the endpoint URL of the Ceph storage system. It is used to establish a connection with the Ceph cluster")
+ .defaultValue("")
+ .required(true)
+ .example("http://cephcluster:7480")
+ .sensitive(true)
+ descriptor = cephEndpoint :: descriptor
+
+ val types = new PropertyDescriptor()
+ .name("types")
+ .displayName("Types")
+ .description("The format you want to write is json,csv,parquet")
+ .defaultValue("csv")
+ .allowableValues(Set("json", "csv", "parquet"))
+ .required(true)
+ .example("csv")
+ descriptor = types :: descriptor
+
+ val delimiter = new PropertyDescriptor()
+ .name("delimiter")
+ .displayName("Delimiter")
+ .description("The delimiter of csv file")
+ .defaultValue(",")
+ .required(true)
+ .example(",")
+ descriptor = delimiter :: descriptor
+
+
+ val header = new PropertyDescriptor()
+ .name("header")
+ .displayName("Header")
+ .description("Whether the csv file has a header")
+ .defaultValue("true")
+ .allowableValues(Set("true", "false"))
+ .required(true)
+ .example("true")
+ descriptor = header :: descriptor
+
+
+ val path= new PropertyDescriptor()
+ .name("path")
+ .displayName("Path")
+ .description("The file path you want to write to")
+ .defaultValue("s3a://radosgw-test/test_df")
+ .required(true)
+ .example("s3a://radosgw-test/test_df")
+ descriptor = path :: descriptor
+
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/ceph/ceph.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroup.CephGroup)
+ }
+
+
+}
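
CephWrite is the mirror image: whichever format is chosen, it writes to the s3a path with overwrite mode. A matching standalone sketch with the same placeholder credentials and a synthetic DataFrame (again an assumption-laden sketch, not the stop itself):

```scala
import org.apache.spark.sql.SparkSession

// Standalone sketch of CephWrite's csv branch; keys, endpoint and path are placeholders.
// Requires hadoop-aws and the AWS SDK on the classpath.
object CephWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ceph-write-sketch").master("local[*]").getOrCreate()
    spark.conf.set("fs.s3a.access.key", "ACCESS_KEY")
    spark.conf.set("fs.s3a.secret.key", "SECRET_KEY")
    spark.conf.set("fs.s3a.endpoint", "http://cephcluster:7480")
    spark.conf.set("fs.s3a.connection.ssl.enabled", "false")

    import spark.implicits._
    val df = Seq(("a", 1), ("b", 2)).toDF("name", "value")

    df.write
      .format("csv")
      .option("header", "true")
      .option("delimiter", ",")
      .mode("overwrite")
      .save("s3a://radosgw-test/test_df")

    spark.stop()
  }
}
```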
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/EmailClean.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/EmailClean.scala
index 43ac6831..6f006e5e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/EmailClean.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/EmailClean.scala
@@ -5,6 +5,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
class EmailClean extends ConfigurableStop{
@@ -19,7 +20,7 @@ class EmailClean extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dfOld = in.read()
+ val dfOld = in.read().getSparkDf
dfOld.createOrReplaceTempView("thesis")
sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processEmail(str))
val structFields: Array[String] = dfOld.schema.fieldNames
@@ -51,7 +52,7 @@ class EmailClean extends ConfigurableStop{
val sqlTextNew:String = "select " + schemaStr.substring(0,schemaStr.length -1) + " from thesis"
val dfNew1=sqlContext.sql(sqlTextNew)
- out.write(dfNew1)
+ out.write(new SciDataFrame(dfNew1))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/IdentityNumberClean.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/IdentityNumberClean.scala
index 2d2c933b..bc2c45de 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/IdentityNumberClean.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/IdentityNumberClean.scala
@@ -6,6 +6,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{DataFrame, SparkSession}
@@ -22,7 +23,7 @@ class IdentityNumberClean extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dfOld = in.read()
+ val dfOld = in.read().getSparkDf
dfOld.createOrReplaceTempView("thesis")
sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processCardCode(str))
val structFields: Array[String] = dfOld.schema.fieldNames
@@ -54,7 +55,7 @@ class IdentityNumberClean extends ConfigurableStop{
val sqlTextNew:String = "select " + schemaStr.substring(0,schemaStr.length -1) + " from thesis"
val dfNew1=sqlContext.sql(sqlTextNew)
- out.write(dfNew1)
+ out.write(new SciDataFrame(dfNew1))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/PhoneNumberClean.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/PhoneNumberClean.scala
index d0a65d38..1895f2de 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/PhoneNumberClean.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/PhoneNumberClean.scala
@@ -5,6 +5,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructField
@@ -20,7 +21,7 @@ class PhoneNumberClean extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dfOld = in.read()
+ val dfOld = in.read().getSparkDf
dfOld.createOrReplaceTempView("thesis")
sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processPhonenum(str))
val structFields: Array[String] = dfOld.schema.fieldNames
@@ -52,7 +53,7 @@ class PhoneNumberClean extends ConfigurableStop{
val sqlTextNew:String = "select " + schemaStr.substring(0,schemaStr.length -1) + " from thesis"
val dfNew1=sqlContext.sql(sqlTextNew)
- out.write(dfNew1)
+ out.write(new SciDataFrame(dfNew1))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/ProvinceClean.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/ProvinceClean.scala
index 09188341..9593d32e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/ProvinceClean.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/ProvinceClean.scala
@@ -4,6 +4,7 @@ import cn.piflow.bundle.util.CleanUtil
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.SparkSession
@@ -18,7 +19,7 @@ class ProvinceClean extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dfOld = in.read()
+ val dfOld = in.read().getSparkDf
dfOld.createOrReplaceTempView("thesis")
sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processProvince(str))
val structFields: Array[String] = dfOld.schema.fieldNames
@@ -50,7 +51,7 @@ class ProvinceClean extends ConfigurableStop{
val sqlTextNew:String = "select " + schemaStr.substring(0,schemaStr.length -1) + " from thesis"
val dfNew1=sqlContext.sql(sqlTextNew)
- out.write(dfNew1)
+ out.write(new SciDataFrame(dfNew1))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/TitleClean.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/TitleClean.scala
index a176980f..e1145bdf 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/TitleClean.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/clean/TitleClean.scala
@@ -5,6 +5,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructField
@@ -19,7 +20,7 @@ class TitleClean extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dfOld = in.read()
+ val dfOld = in.read().getSparkDf
dfOld.createOrReplaceTempView("thesis")
sqlContext.udf.register("regexPro",(str:String)=>CleanUtil.processTitle(str))
val structFields: Array[String] = dfOld.schema.fieldNames
@@ -51,7 +52,7 @@ class TitleClean extends ConfigurableStop{
val sqlTextNew:String = "select " + schemaStr.substring(0,schemaStr.length -1) + " from thesis"
val dfNew1=sqlContext.sql(sqlTextNew)
- out.write(dfNew1)
+ out.write(new SciDataFrame(dfNew1))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/clickhouse/ClickhouseRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/clickhouse/ClickhouseRead.scala
index ca8aa3cd..53d8d69b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/clickhouse/ClickhouseRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/clickhouse/ClickhouseRead.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.clickhouse
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
@@ -36,7 +37,7 @@ class ClickhouseRead extends ConfigurableStop {
.options(options)
.load()
jdbcDF.show()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/clickhouse/ClickhouseWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/clickhouse/ClickhouseWrite.scala
index aff3c031..b64575bf 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/clickhouse/ClickhouseWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/clickhouse/ClickhouseWrite.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{DataFrame, SaveMode}
import java.util.Properties
@@ -22,7 +23,7 @@ class ClickhouseWrite extends ConfigurableStop{
var dbtable:String = _
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val jdbcDF: DataFrame = in.read()
+ val jdbcDF: DataFrame = in.read().getSparkDf
val properties: Properties = new Properties()
properties.put("driver", driver)
if (user != null && user.nonEmpty) {
@@ -35,7 +36,7 @@ class ClickhouseWrite extends ConfigurableStop{
"numPartitions" -> "1"
)
jdbcDF.write.mode(SaveMode.Append).options(options).jdbc(url, dbtable, properties)
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/AddUUIDStop.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/AddUUIDStop.scala
index d01c3756..c1c89066 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/AddUUIDStop.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/AddUUIDStop.scala
@@ -1,10 +1,10 @@
package cn.piflow.bundle.common
import java.util.UUID
-
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
@@ -19,14 +19,14 @@ class AddUUIDStop extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- var df = in.read()
+ var df = in.read().getSparkDf
spark.udf.register("generateUUID",()=>UUID.randomUUID().toString.replace("-",""))
df.createOrReplaceTempView("temp")
df = spark.sql(s"select generateUUID() as ${column},* from temp")
- out.write(df)
+ out.write(new SciDataFrame(df))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/ConvertSchema.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/ConvertSchema.scala
index a8fb9493..c1fa16f6 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/ConvertSchema.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/ConvertSchema.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
class ConvertSchema extends ConfigurableStop {
@@ -15,7 +16,7 @@ class ConvertSchema extends ConfigurableStop {
var schema:String = _
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- var df = in.read()
+ var df = in.read().getSparkDf
val field = schema.split(",").map(x => x.trim)
@@ -24,7 +25,7 @@ class ConvertSchema extends ConfigurableStop {
df = df.withColumnRenamed(old_new(0),old_new(1))
})
- out.write(df)
+ out.write(new SciDataFrame(df))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Distinct.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Distinct.scala
index 38e493e3..bd6a2d80 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Distinct.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Distinct.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.common
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.DataFrame
@@ -47,7 +48,7 @@ class Distinct extends ConfigurableStop{
}
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val inDf: DataFrame = in.read()
+ val inDf: DataFrame = in.read().getSparkDf
var outDf: DataFrame = null
if(columnNames.length > 0){
val fileArr: Array[String] = columnNames.split(",")
@@ -55,6 +56,6 @@ class Distinct extends ConfigurableStop{
}else{
outDf = inDf.distinct()
}
- out.write(outDf)
+ out.write(new SciDataFrame(outDf))
}
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/DropField.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/DropField.scala
index 1378af71..19001cdd 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/DropField.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/DropField.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
class DropField extends ConfigurableStop {
@@ -16,14 +17,14 @@ class DropField extends ConfigurableStop {
var columnNames:String = _
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- var df = in.read()
+ var df = in.read().getSparkDf
val field = columnNames.split(",").map(x => x.trim)
for( x <- 0 until field.size){
df = df.drop(field(x))
}
- out.write(df)
+ out.write(new SciDataFrame(df))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/ExecuteSQLStop.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/ExecuteSQLStop.scala
index ac128fd8..bfda3185 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/ExecuteSQLStop.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/ExecuteSQLStop.scala
@@ -7,6 +7,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.lib._
import cn.piflow.lib.io.{FileFormat, TextFile}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{DataFrame, SparkSession}
class ExecuteSQLStop extends ConfigurableStop{
@@ -23,11 +24,11 @@ class ExecuteSQLStop extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val inDF = in.read()
+ val inDF = in.read().getSparkDf
inDF.createOrReplaceTempView(ViewName)
val frame: DataFrame = spark.sql(sql)
- out.write(frame)
+ out.write(new SciDataFrame(frame))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Filter.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Filter.scala
index fe2b164f..755270fb 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Filter.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Filter.scala
@@ -4,6 +4,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{Column, DataFrame}
class Filter extends ConfigurableStop{
@@ -45,10 +46,10 @@ class Filter extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val df = in.read()
+ val df = in.read().getSparkDf
var filterDF : DataFrame = df.filter(condition)
- out.write(filterDF)
+ out.write(new SciDataFrame(filterDF))
}
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Fork.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Fork.scala
index 279e67e8..3f11cc93 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Fork.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Fork.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.common
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
@@ -26,8 +27,8 @@ class Fork extends ConfigurableStop{
}
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val df = in.read().cache()
- outports.foreach(out.write(_, df));
+ val df = in.read().getSparkDf.cache()
+ outports.foreach(out.write(_, new SciDataFrame(df)));
}
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Join.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Join.scala
index 6dffbfaa..7f1c2002 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Join.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Join.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.common
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.{Column, DataFrame}
@@ -17,8 +18,8 @@ class Join extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val leftDF = in.read(Port.LeftPort)
- val rightDF = in.read(Port.RightPort)
+ val leftDF = in.read(Port.LeftPort).getSparkDf
+ val rightDF = in.read(Port.RightPort).getSparkDf
var seq: Seq[String]= Seq()
correlationColumn.split(",").foreach(x=>{
@@ -32,7 +33,7 @@ class Join extends ConfigurableStop{
case "right" => df = leftDF.join(rightDF,seq,"right_outer")
case "full_outer" => df = leftDF.join(rightDF,seq,"outer")
}
- out.write(df)
+ out.write(new SciDataFrame(df))
}
override def setProperties(map: Map[String, Any]): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Merge.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Merge.scala
index e5be9ead..bbb107f6 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Merge.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Merge.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.common
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
class Merge extends ConfigurableStop{
@@ -15,7 +16,7 @@ class Merge extends ConfigurableStop{
var inports : List[String] = _
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- out.write(in.ports().map(in.read(_)).reduce((x, y) => x.union(y)));
+ out.write(new SciDataFrame(in.ports().map(in.read(_).getSparkDf).reduce((x, y) => x.union(y))));
}
def initialize(ctx: ProcessContext): Unit = {
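The refactored `Merge.perform` above compresses the whole read-union-write cycle into a single line; an equivalent, unpacked form (a sketch with identical behavior) reads:

```scala
val merged = in.ports()                    // every inbound port name
  .map(port => in.read(port).getSparkDf)   // unwrap each SciDataFrame to a Spark DataFrame
  .reduce((x, y) => x.union(y))            // union all inputs by column position
out.write(new SciDataFrame(merged))        // re-wrap before handing the result downstream
```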
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/MockData.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/MockData.scala
index e3aa4a94..396a632f 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/MockData.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/MockData.scala
@@ -5,6 +5,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types._
import org.json4s
@@ -91,7 +92,7 @@ class MockData extends ConfigurableStop{
val schemaStructType = StructType(structFieldArray)
val rnd : Random = new Random()
val df = spark.read.schema(schemaStructType).json((0 to count -1 ).map{ _ => compact(randomJson(rnd,schemaStructType))}.toDS())
- out.write(df)
+ out.write(new SciDataFrame(df))
}
private def randomJson( rnd: Random, dataType : DataType): JValue ={
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Route.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Route.scala
index 955a73d6..36028a5d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Route.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Route.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.common
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
class Route extends ConfigurableStop{
@@ -23,7 +24,7 @@ class Route extends ConfigurableStop{
}
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val df = in.read().cache()
+ val df = in.read().getSparkDf.cache()
if(this.customizedProperties != null || this.customizedProperties.size != 0){
val it = this.customizedProperties.keySet.iterator
@@ -31,10 +32,10 @@ class Route extends ConfigurableStop{
val port = it.next()
val filterCondition = MapUtil.get(this.customizedProperties,port).asInstanceOf[String]
val filterDf = df.filter(filterCondition)
- out.write(port,filterDf)
+ out.write(port,new SciDataFrame(filterDf))
}
}
- out.write(df);
+ out.write(new SciDataFrame(df));
}
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/SelectField.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/SelectField.scala
index b45cfe54..47b8f9fe 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/SelectField.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/SelectField.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{Column, DataFrame}
import scala.beans.BeanProperty
@@ -19,7 +20,7 @@ class SelectField extends ConfigurableStop {
var columnNames:String = _
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val df = in.read()
+ val df = in.read().getSparkDf
val field = columnNames.split(",").map(x => x.trim)
val columnArray : Array[Column] = new Array[Column](field.size)
@@ -28,7 +29,7 @@ class SelectField extends ConfigurableStop {
}
var finalFieldDF : DataFrame = df.select(columnArray:_*)
- out.write(finalFieldDF)
+ out.write(new SciDataFrame(finalFieldDF))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Subtract.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Subtract.scala
index 1f99f91f..f2ca25be 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Subtract.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/common/Subtract.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.common
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.sql.types.StructType
@@ -38,11 +39,11 @@ class Subtract extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val leftDF = in.read(Port.LeftPort)
- val rightDF = in.read(Port.RightPort)
+ val leftDF = in.read(Port.LeftPort).getSparkDf
+ val rightDF = in.read(Port.RightPort).getSparkDf
val outDF = leftDF.except(rightDF)
- out.write(outDF)
+ out.write(new SciDataFrame(outDF))
}
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvParser.scala
index 9031b196..b3b459ac 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvParser.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvParser.scala
@@ -1,6 +1,7 @@
package cn.piflow.bundle.csv
import cn.piflow._
+import cn.piflow.util.SciDataFrame
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
@@ -23,13 +24,13 @@ class CsvParser extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- var csvDF:DataFrame = null
+ var csvDF:SciDataFrame = null
if (header){
- csvDF = spark.read
+      csvDF = new SciDataFrame(spark.read
.option("header",header)
.option("inferSchema","true")
.option("delimiter",delimiter)
- .csv(csvPath)
+ .csv(csvPath))
}else{
@@ -40,13 +41,13 @@ class CsvParser extends ConfigurableStop{
}
val schemaStructType = StructType(structFieldArray)
- csvDF = spark.read
+      csvDF = new SciDataFrame(spark.read
.option("header",header)
.option("inferSchema","false")
.option("delimiter",delimiter)
.option("timestampFormat","yyyy/MM/dd HH:mm:ss ZZ")
.schema(schemaStructType)
- .csv(csvPath)
+ .csv(csvPath))
}
out.write(csvDF)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvSave.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvSave.scala
index ff422cc2..31bfae7b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvSave.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvSave.scala
@@ -5,6 +5,36 @@ import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.arrow.memory.RootAllocator
+import org.apache.arrow.vector.ipc.{ArrowFileWriter, WriteChannel}
+import org.apache.arrow.vector.{BigIntVector, BitVector, DateDayVector, Float8Vector, IntVector, ValueVector, VarCharVector, VectorSchemaRoot}
+import org.apache.arrow.vector.util.VectorBatchAppender
+
+import java.io.{File, FileOutputStream}
+import java.net.{ServerSocket, Socket}
+import java.nio.channels.Channels
+import scala.collection.JavaConverters._
+import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}
+import org.apache.arrow.vector.types.{FloatingPointPrecision, TimeUnit}
+import org.apache.spark.sql.types._
+
+import java.nio.charset.StandardCharsets
+
class CsvSave extends ConfigurableStop{
val authorEmail: String = "xjzhu@cnic.cn"
@@ -94,9 +124,109 @@ class CsvSave extends ConfigurableStop{
override def initialize(ctx: ProcessContext): Unit = {
}
+ def sparkTypeToArrowType(dataType: DataType): ArrowType = dataType match {
+ case IntegerType => new ArrowType.Int(32, true)
+ case LongType => new ArrowType.Int(64, true)
+ case FloatType => new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
+ case DoubleType => new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
+ case StringType => new ArrowType.Utf8()
+ case BooleanType => ArrowType.Bool.INSTANCE
+ case BinaryType => ArrowType.Binary.INSTANCE
+ case TimestampType => new ArrowType.Timestamp(TimeUnit.MILLISECOND, null)
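+    // Note: DateType has no mapping here, so date columns fall through to the UnsupportedOperationException below.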
+ case _ => throw new UnsupportedOperationException(s"Unsupported type: $dataType")
+ }
+
+ def toArrowSchema(schema: StructType): Schema = {
+ val fields = schema.fields.map { field =>
+ new Field(
+ field.name,
+ FieldType.nullable(sparkTypeToArrowType(field.dataType)),
+ null
+ )
+ }.toList
+ new Schema(fields.asJava)
+ }
+
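+  // A FieldProcessor writes one cell: (rowIndex, cellValue) => mutation of the Arrow vector it was bound to.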
+ type FieldProcessor = (Int, Any) => Unit
+ private def createFieldProcessor(sparkType: DataType, vector: ValueVector): FieldProcessor =
+ (sparkType, vector) match {
+      // Int type (Integer/Numeric)
+ case (_: IntegerType, vec: IntVector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, value.asInstanceOf[Int])
+      // String type
+ case (_: StringType, vec: VarCharVector) => (rowIdx, value) =>
+ if (value == null) {
+ vec.setNull(rowIdx)
+ } else {
+ val strValue = value.toString
+ val bytes = strValue.getBytes(StandardCharsets.UTF_8)
+ vec.setSafe(rowIdx, bytes, 0, bytes.length)
+ }
+      // Double type
+ case (_: DoubleType, vec: Float8Vector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, value.asInstanceOf[Double])
+      // Long type
+ case (_: LongType, vec: BigIntVector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, value.asInstanceOf[Long])
+      // Boolean type (backed by a BitVector)
+ case (_: BooleanType, vec: BitVector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+ else vec.setSafe(rowIdx, if (value.asInstanceOf[Boolean]) 1 else 0)
+      // Date type (example)
+ case (_: DateType, vec: DateDayVector) => (rowIdx, value) =>
+ if (value == null) vec.setNull(rowIdx)
+        else vec.setSafe(rowIdx, value.asInstanceOf[Int]) // needs conversion based on the actual date representation
+ case _ => throw new IllegalArgumentException(
+ s"Unsupported type combination: SparkType=$sparkType, VectorType=${vector.getClass}"
+ )
+ }
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val df = in.read()
+ val df = in.read().getSparkDf
+
+ val allocator = new RootAllocator(Long.MaxValue)
+ val arrowSchema = toArrowSchema(df.schema)
+ val root = VectorSchemaRoot.create(arrowSchema, allocator)
+
+ val serverSocket = new ServerSocket(9090)
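+    // NOTE: the listening port is hard-coded; serverSocket.accept() further below blocks until an Arrow client connects before the CSV output is written.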
+
+ println("Server is listening on port 9090")
+
+
+ try {
+ root.allocateNew()
+      // build the type-mapped field processors
+ val fieldProcessors = df.schema.zipWithIndex.map { case (field, idx) =>
+        createFieldProcessor(field.dataType, root.getVector(idx)) // dynamically bind the matching Vector type
+ }
+      // process the data row by row
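+      // NOTE: collect() pulls the entire DataFrame onto the driver, so this Arrow hand-off assumes the data fits in driver memory.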
+ val rows = df.collect().toList
+ root.setRowCount(rows.size)
+ for {
+ (row, rowIndex) <- rows.zipWithIndex
+ (value, processor) <- row.toSeq.zip(fieldProcessors)
+ } {
+        processor(rowIndex, value) // write the value type-safely
+ }
+ val socket: Socket = serverSocket.accept()
+ val writer = new ArrowFileWriter(root, null, Channels.newChannel(socket.getOutputStream))
+ try {
+
+ writer.start()
+ writer.writeBatch()
+ writer.end()
+ } finally {
+ writer.close()
+ socket.close()
+ }
+ } finally {
+ root.close()
+ allocator.close()
+ }
+
if("".equals(partition)){
df.write
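The Arrow hand-off added to `CsvSave.perform` serves a single batch in the Arrow file format over a raw socket before the CSV itself is written. A minimal client sketch for consuming that batch (the object name is illustrative, and it assumes the hard-coded port 9090 and a payload small enough to buffer, since the file format is read through a seekable channel):

```scala
import java.net.Socket
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileReader
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel

object ArrowCsvSaveClient {
  def main(args: Array[String]): Unit = {
    val socket = new Socket("localhost", 9090)  // assumption: same host as the running stop
    // Buffer the whole payload: the Arrow *file* format is read via a seekable channel.
    val bytes = Stream.continually(socket.getInputStream.read()).takeWhile(_ != -1).map(_.toByte).toArray
    socket.close()

    val allocator = new RootAllocator(Long.MaxValue)
    val reader = new ArrowFileReader(new ByteArrayReadableSeekableByteChannel(bytes), allocator)
    try {
      while (reader.loadNextBatch()) {
        val root = reader.getVectorSchemaRoot
        println(s"received ${root.getRowCount} rows, schema: ${root.getSchema}")
      }
    } finally {
      reader.close()
      allocator.close()
    }
  }
}
```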
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvStringParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvStringParser.scala
index 13e6abc1..60ac8e95 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvStringParser.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/csv/CsvStringParser.scala
@@ -4,6 +4,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
@@ -45,7 +46,7 @@ class CsvStringParser extends ConfigurableStop{
val fields: Array[StructField] = schema.split(",").map(d=>StructField(d.trim,StringType,nullable = true))
val NewSchema: StructType = StructType(fields)
Fdf = session.createDataFrame(rowRDD,NewSchema)
- out.write(Fdf)
+ out.write(new SciDataFrame(Fdf))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/elasticsearch/PutElasticsearch.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/elasticsearch/PutElasticsearch.scala
index 22738c4a..0771630a 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/elasticsearch/PutElasticsearch.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/elasticsearch/PutElasticsearch.scala
@@ -20,7 +20,7 @@ class PutElasticsearch extends ConfigurableStop {
var saveMode : String = _
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val inDfES = in.read()
+ val inDfES = in.read().getSparkDf
inDfES.write.format("org.elasticsearch.spark.sql")
.option("es.nodes", es_nodes)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/elasticsearch/ReadElasticsearch.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/elasticsearch/ReadElasticsearch.scala
index d5c7a88b..80a76697 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/elasticsearch/ReadElasticsearch.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/elasticsearch/ReadElasticsearch.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.elasticsearch
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.SparkSession
@@ -26,7 +27,7 @@ class ReadElasticsearch extends ConfigurableStop {
.option("es.port", es_port)
.load(s"${es_index}/${es_type}")
- out.write(esDF)
+ out.write(new SciDataFrame(esDF))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelRead.scala
index 3faa7fea..ba77c9b0 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelRead.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.excel
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.SparkSession
@@ -31,7 +32,7 @@ class ExcelRead extends ConfigurableStop{
.option("header", header)
.load(filePath)
- out.write(frame)
+ out.write(new SciDataFrame(frame))
}
override def setProperties(map: Map[String, Any]): Unit = {
@@ -112,7 +113,7 @@ class ExcelRead extends ConfigurableStop{
}
override def getIcon(): Array[Byte] = {
- ImageUtil.getImage("icon/excel/excelParse.png",this.getClass.getName)
+ ImageUtil.getImage("icon/excel/excelParse.png")
}
override def getGroup(): List[String] = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelWrite.scala
index 82e2e6dd..0c3237af 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelWrite.scala
@@ -17,7 +17,7 @@ class ExcelWrite extends ConfigurableStop{
var saveMode: String = _
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val df = in.read()
+ val df = in.read().getSparkDf
df.write
.format("com.crealytics.spark.excel")
.option("dataAddress",dataAddress)
@@ -84,7 +84,7 @@ class ExcelWrite extends ConfigurableStop{
}
override def getIcon(): Array[Byte] = {
- ImageUtil.getImage("icon/excel/excelParse.png",this.getClass.getName)
+ ImageUtil.getImage("icon/excel/excelParse.png")
}
override def getGroup(): List[String] = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelWriteMultipleSheets.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelWriteMultipleSheets.scala
new file mode 100644
index 00000000..1ef00677
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/excel/ExcelWriteMultipleSheets.scala
@@ -0,0 +1,85 @@
+package cn.piflow.bundle.excel
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+
+class ExcelWriteMultipleSheets extends ConfigurableStop{
+ val authorEmail: String = "ygang@cnic.cn"
+ val description: String = "Write multiple DataFrames into multiple sheets of the same Excel file"
+ val inportList: List[String] = List(Port.AnyPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var filePath: String = _
+ var header: String = _
+
+ var inports : List[String] = _
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+
+ inports.foreach(x=>{
+ val df = in.read(x).getSparkDf
+ df.write
+ .format("com.crealytics.spark.excel")
+ .option("dataAddress",s"'${x}'!A1")
+ .option("header", header)
+ .mode("append")
+ .save(filePath)
+ })
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ val inportStr = MapUtil.get(map,"inports").asInstanceOf[String]
+ inports = inportStr.split(",").map(x => x.trim).toList
+
+ filePath = MapUtil.get(map,"filePath").asInstanceOf[String]
+ header = MapUtil.get(map,"header").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor : List[PropertyDescriptor] = List()
+
+ val filePath = new PropertyDescriptor()
+ .name("filePath")
+ .displayName("FilePath")
+ .description("The path of excel file")
+ .defaultValue("")
+ .required(true)
+ .example("/test/test.xlsx")
+ descriptor = filePath :: descriptor
+
+ val header = new PropertyDescriptor()
+ .name("header")
+ .displayName("Header")
+ .description("Whether the excel file has a header")
+ .defaultValue("true")
+ .allowableValues(Set("true","false"))
+ .required(true)
+ .example("true")
+ descriptor = header :: descriptor
+
+ val inports = new PropertyDescriptor()
+ .name("inports")
+ .displayName("inports")
+ .description("Inports string are separated by commas")
+ .defaultValue("")
+ .required(true)
+ descriptor = inports :: descriptor
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/excel/excelParse.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroup.ExcelGroup)
+ }
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
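A hypothetical `setProperties` map for the new stop, only to show how the three descriptors above are consumed (the values are illustrative, not defaults):

```scala
// Illustrative configuration for ExcelWriteMultipleSheets.
val props: Map[String, Any] = Map(
  "inports"  -> "sheetA,sheetB",   // one sheet per inport; the port name becomes the sheet name ('<port>'!A1)
  "filePath" -> "/test/test.xlsx",
  "header"   -> "true"
)
// stop.setProperties(props) would then append one sheet per inport to /test/test.xlsx.
```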
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/file/RegexTextProcess.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/file/RegexTextProcess.scala
index ff4f0f50..352cb6c5 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/file/RegexTextProcess.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/file/RegexTextProcess.scala
@@ -4,6 +4,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
class RegexTextProcess extends ConfigurableStop{
@@ -19,14 +20,14 @@ class RegexTextProcess extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dfOld = in.read()
+ val dfOld = in.read().getSparkDf
val regexText=regex
val replaceText=replaceStr
dfOld.createOrReplaceTempView("thesis")
sqlContext.udf.register("regexPro",(str:String)=>str.replaceAll(regexText,replaceText))
val sqlText:String="select *,regexPro("+columnName+") as "+columnName+"_new from thesis"
val dfNew=sqlContext.sql(sqlText)
- out.write(dfNew)
+ out.write(new SciDataFrame(dfNew))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/graphx/LabelPropagation.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/graphx/LabelPropagation.scala
index 8867f67d..f95a0541 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/graphx/LabelPropagation.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/graphx/LabelPropagation.scala
@@ -7,7 +7,7 @@ import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.graphx._
import org.apache.spark.graphx.lib.LabelPropagation
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class LabelPropagation extends ConfigurableStop {
val authorEmail: String = "06whuxx@163.com"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/graphx/LoadGraph.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/graphx/LoadGraph.scala
index 1a2135f3..4b8755f2 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/graphx/LoadGraph.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/graphx/LoadGraph.scala
@@ -6,6 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
import org.apache.spark.graphx.{GraphLoader, PartitionStrategy}
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class LoadGraph extends ConfigurableStop {
val authorEmail: String = "06whuxx@163.com"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hbase/PutHbase.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hbase/PutHbase.scala
index b2072299..3d0b3def 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hbase/PutHbase.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hbase/PutHbase.scala
@@ -13,7 +13,26 @@ import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * Copyright (c) 2022 πFlow. All rights reserved.
+ */
class PutHbase extends ConfigurableStop{
override val authorEmail: String = "ygang@cnic.cn"
@@ -28,7 +47,7 @@ class PutHbase extends ConfigurableStop{
var columnFamily: String = _
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val df = in.read()
+ val df = in.read().getSparkDf
val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum",zookeeperQuorum) // set the ZooKeeper quorum address; it could also be supplied via an hbase-site.xml on the classpath, but setting it in code is recommended
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hbase/ReadHbase.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hbase/ReadHbase.scala
index 595fe93b..9512ca5a 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hbase/ReadHbase.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hbase/ReadHbase.scala
@@ -4,6 +4,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types.{StringType, StructField, StructType}
@@ -12,7 +13,26 @@ import org.apache.hadoop.hbase.HBaseConfiguration
import scala.collection.mutable.ArrayBuffer
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * Copyright (c) 2022 πFlow. All rights reserved.
+ */
class ReadHbase extends ConfigurableStop{
override val authorEmail: String = "ygang@cnic.cn"
@@ -78,7 +98,7 @@ class ReadHbase extends ConfigurableStop{
})
val df=spark.createDataFrame(kv,dfSchema)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
override def setProperties(map: Map[String, Any]): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/DeleteHdfs.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/DeleteHdfs.scala
index 94dbdf8e..3338b939 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/DeleteHdfs.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/DeleteHdfs.scala
@@ -27,7 +27,7 @@ class DeleteHdfs extends ConfigurableStop{
val spark = pec.get[SparkSession]()
if (isCustomize.equals("false")){
- val inDf = in.read()
+ val inDf = in.read().getSparkDf
val configuration: Configuration = new Configuration()
var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String]
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/FileDownHdfs.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/FileDownHdfs.scala
index 6e436d5d..6bc05c9d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/FileDownHdfs.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/FileDownHdfs.scala
@@ -2,10 +2,10 @@ package cn.piflow.bundle.hdfs
import java.io.InputStream
import java.net.{HttpURLConnection, URL}
-
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
@@ -59,7 +59,7 @@ class FileDownHdfs extends ConfigurableStop{
val schema: StructType = StructType(fields)
val df: DataFrame = spark.createDataFrame(rdd,schema)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/GetHdfs.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/GetHdfs.scala
index 444454d7..7bfadf2f 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/GetHdfs.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/GetHdfs.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
@@ -27,28 +28,28 @@ class GetHdfs extends ConfigurableStop{
if (types == "json") {
val df = spark.read.json(path)
df.schema.printTreeString()
- out.write(df)
+ out.write(new SciDataFrame(df))
} else if (types == "csv") {
val df = spark.read.csv(path)
df.schema.printTreeString()
- out.write(df)
+ out.write(new SciDataFrame(df))
}else if (types == "parquet") {
      val df = spark.read.parquet(path)
df.schema.printTreeString()
- out.write(df)
+ out.write(new SciDataFrame(df))
}else if (types == "orc"){
val df = spark.read.orc(path)
df.schema.printTreeString()
- out.write(df)
+ out.write(new SciDataFrame(df))
}
else {
val rdd = sc.textFile(path)
val outDf = rdd.toDF()
outDf.schema.printTreeString()
- out.write(outDf)
+ out.write(new SciDataFrame(outDf))
}
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/ListHdfs.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/ListHdfs.scala
index ac58b6ec..7b7ea30b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/ListHdfs.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/ListHdfs.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.spark.rdd.RDD
@@ -44,7 +45,7 @@ class ListHdfs extends ConfigurableStop{
StructField("path",StringType)
))
val outDF: DataFrame = spark.createDataFrame(rowRDD,schema)
- out.write(outDF)
+ out.write(new SciDataFrame(outDF))
}
// recursively traverse the folder
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/PutHdfs.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/PutHdfs.scala
index 310db5d6..80e7c135 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/PutHdfs.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/PutHdfs.scala
@@ -23,7 +23,7 @@ class PutHdfs extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val inDF = in.read()
+ val inDF = in.read().getSparkDf
val config = new Configuration()
config.set("fs.defaultFS",hdfsUrl)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/SaveToHdfs.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/SaveToHdfs.scala
index 86f16518..7d8d3f8e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/SaveToHdfs.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/SaveToHdfs.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.hdfs
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
@@ -38,7 +39,7 @@ class SaveToHdfs extends ConfigurableStop {
config.set("fs.defaultFS",hdfsUrl)
val fs = FileSystem.get(config)
- val inDF = in.read()
+ val inDF = in.read().getSparkDf
if (types=="json"){
@@ -76,7 +77,7 @@ class SaveToHdfs extends ConfigurableStop {
))
val outDF: DataFrame = spark.createDataFrame(rowRDD,schema)
- out.write(outDF)
+ out.write(new SciDataFrame(outDF))
}
// recursively traverse the folder
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/SelectFilesByName.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/SelectFilesByName.scala
index 8df119e2..cb0d5fea 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/SelectFilesByName.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hdfs/SelectFilesByName.scala
@@ -1,10 +1,10 @@
package cn.piflow.bundle.hdfs
import java.util.regex.Pattern
-
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
@@ -69,7 +69,7 @@ class SelectFilesByName extends ConfigurableStop{
val df: DataFrame = session.createDataFrame(rowRDD,schema)
df.collect().foreach(println)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
override def setProperties(map: Map[String, Any]): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/PutHiveMode.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/PutHiveMode.scala
index 784f5edf..bbac9f8c 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/PutHiveMode.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/PutHiveMode.scala
@@ -19,7 +19,7 @@ class PutHiveMode extends ConfigurableStop {
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val inDF = in.read()
+ val inDF = in.read().getSparkDf
inDF.write.format("hive").mode(saveMode).saveAsTable(database + "." + table)
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/PutHiveStreaming.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/PutHiveStreaming.scala
index fb43edae..59507f08 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/PutHiveStreaming.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/PutHiveStreaming.scala
@@ -20,7 +20,7 @@ class PutHiveStreaming extends ConfigurableStop {
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val inDF = in.read()
+ val inDF = in.read().getSparkDf
val dfTempTable = table + "_temp"
inDF.createOrReplaceTempView(dfTempTable)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/SelectHiveQL.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/SelectHiveQL.scala
index a3d11de5..007557f9 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/SelectHiveQL.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/SelectHiveQL.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
import scala.beans.BeanProperty
@@ -25,7 +26,7 @@ class SelectHiveQL extends ConfigurableStop {
import spark.sql
val df = sql(hiveQL)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/SelectHiveQLByJDBC.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/SelectHiveQLByJDBC.scala
index 1e896c15..75852f39 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/SelectHiveQLByJDBC.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/hive/SelectHiveQLByJDBC.scala
@@ -4,6 +4,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SparkSession}
@@ -88,7 +89,7 @@ class SelectHiveQLByJDBC extends ConfigurableStop {
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val sc = pec.get[SparkSession]()
val df = getDF (sc.sqlContext, sc.sparkContext, sql)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
def getDF(sqlContext : SQLContext, sc : SparkContext, tableName : String) : DataFrame = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/http/GetUrl.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/http/GetUrl.scala
index 40000a14..46995e57 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/http/GetUrl.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/http/GetUrl.scala
@@ -1,7 +1,6 @@
package cn.piflow.bundle.http
import java.util
-
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
@@ -13,6 +12,8 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.dom4j.{Document, DocumentHelper, Element}
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+import cn.piflow.util.SciDataFrame
import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
@@ -112,7 +113,7 @@ class GetUrl extends ConfigurableStop{
val outDf: DataFrame = ss.createDataFrame(rowRDD,structType)
- out.write(outDf)
+ out.write(new SciDataFrame(outDf))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/imageProcess/AnimalClassification.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/imageProcess/AnimalClassification.scala
index 7819f6ac..ff20e347 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/imageProcess/AnimalClassification.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/imageProcess/AnimalClassification.scala
@@ -1,11 +1,11 @@
package cn.piflow.bundle.imageProcess
import java.io.{File, FileNotFoundException}
-
import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.http.entity.ContentType
import org.apache.http.util.EntityUtils
import org.apache.spark.rdd.RDD
@@ -89,7 +89,7 @@ class AnimalClassification extends ConfigurableStop {
StructField("res",StringType)
))
val df: DataFrame = session.createDataFrame(rowRDD,schema)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/internetWorm/spider.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/internetWorm/spider.scala
index f6794590..fa1f7e1b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/internetWorm/spider.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/internetWorm/spider.scala
@@ -4,10 +4,10 @@ import java.io.{BufferedOutputStream, File, FileOutputStream, InputStream}
import java.net.URL
import java.text.SimpleDateFormat
import java.util.Date
-
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
@@ -92,7 +92,7 @@ class spider extends ConfigurableStop{
val schema: StructType = StructType(fields)
val df: DataFrame = session.createDataFrame(rowRDD,schema)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/DamengRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/DamengRead.scala
new file mode 100644
index 00000000..283a9d23
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/DamengRead.scala
@@ -0,0 +1,111 @@
+package cn.piflow.bundle.jdbc
+
+import cn.piflow._
+import cn.piflow.conf._
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
+import org.apache.spark.sql.SparkSession
+
+class DamengRead extends ConfigurableStop {
+
+ val authorEmail: String = "ygang@cnic.cn"
+ val description: String = "Read data from dameng database with jdbc"
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var url:String = _
+ var user:String = _
+ var password:String = _
+ var selectedContent:String = _
+ var tableName:String = _
+
+ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+
+ val spark = pec.get[SparkSession]()
+ val dbtable = "( select " + selectedContent + " from " + tableName + " ) AS Temp"
+ val jdbcDF = spark.read.format("jdbc")
+ .option("url", url)
+ .option("driver", "dm.jdbc.driver.DmDrive")
+ .option("dbtable", dbtable)
+ .option("user", user)
+ .option("password",password)
+ .load()
+
+ out.write(new SciDataFrame(jdbcDF))
+ }
+
+ def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+
+ url = MapUtil.get(map,"url").asInstanceOf[String]
+ user = MapUtil.get(map,"user").asInstanceOf[String]
+ password = MapUtil.get(map,"password").asInstanceOf[String]
+ selectedContent= MapUtil.get(map,"selectedContent").asInstanceOf[String]
+ tableName= MapUtil.get(map,"tableName").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor : List[PropertyDescriptor] = List()
+
+ val url=new PropertyDescriptor()
+ .name("url")
+ .displayName("Url")
+ .description("The Url of dameng database")
+ .defaultValue("")
+ .required(true)
+ .example("jdbc:dm://127.0.0.1:5236/DAMENG")
+ descriptor = url :: descriptor
+
+
+ val user=new PropertyDescriptor()
+ .name("user")
+ .displayName("User")
+ .description("The user name of dameng")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ descriptor = user :: descriptor
+
+ val password=new PropertyDescriptor()
+ .name("password")
+ .displayName("Password")
+ .description("The password of dameng")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ .sensitive(true)
+ descriptor = password :: descriptor
+
+ val selectedContent =new PropertyDescriptor()
+ .name("selectedContent")
+ .displayName("SelectedContent")
+ .description("The content you selected to read in the DBTable")
+ .defaultValue("*")
+ .required(true)
+ .example("*")
+ descriptor = selectedContent :: descriptor
+
+ val tableName =new PropertyDescriptor()
+ .name("tableName")
+ .displayName("TableName")
+ .description("The table you want to read")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ descriptor = tableName :: descriptor
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/jdbc/dameng.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroup.JdbcGroup)
+ }
+
+}
\ No newline at end of file
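For clarity, the `dbtable` option built in `DamengRead.perform` wraps the selection in a derived table so that Spark's JDBC reader only pulls the selected columns; with hypothetical values:

```scala
val selectedContent = "ID, NAME"   // illustrative
val tableName       = "PERSON"     // illustrative
val dbtable = "( select " + selectedContent + " from " + tableName + " ) AS Temp"
// dbtable == "( select ID, NAME from PERSON ) AS Temp"
```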
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/DamengWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/DamengWrite.scala
new file mode 100644
index 00000000..d971a083
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/DamengWrite.scala
@@ -0,0 +1,109 @@
+package cn.piflow.bundle.jdbc
+
+import cn.piflow._
+import cn.piflow.conf._
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+
+class DamengWrite extends ConfigurableStop{
+
+ val authorEmail: String = "ygang@cnic.cn"
+ val description: String = "Write data into dameng database with jdbc"
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var url:String = _
+ var user:String = _
+ var password:String = _
+ var dbtable:String = _
+ var saveMode:String = _
+
+ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val jdbcDF = in.read().getSparkDf
+
+ jdbcDF.write.format("jdbc")
+ .option("url", url)
+ .option("driver", "dm.jdbc.driver.DmDriver")
+ .option("user", user)
+ .option("password", password)
+ .option("dbtable", dbtable)
+ .mode(saveMode)
+ .save()
+ }
+
+ def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ url = MapUtil.get(map,"url").asInstanceOf[String]
+ user = MapUtil.get(map,"user").asInstanceOf[String]
+ password = MapUtil.get(map,"password").asInstanceOf[String]
+ dbtable = MapUtil.get(map,"dbtable").asInstanceOf[String]
+ saveMode = MapUtil.get(map,"saveMode").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor : List[PropertyDescriptor] = List()
+ val saveModeOption = Set("Append", "Overwrite", "Ignore")
+
+ val url=new PropertyDescriptor()
+ .name("url")
+ .displayName("Url")
+ .description("The Url of dameng database")
+ .defaultValue("")
+ .required(true)
+ .example("jdbc:dm://127.0.0.1:5236/DAMENG")
+ descriptor = url :: descriptor
+
+
+ val user=new PropertyDescriptor()
+ .name("user")
+ .displayName("User")
+ .description("The user name of dameng")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ descriptor = user :: descriptor
+
+ val password=new PropertyDescriptor()
+ .name("password")
+ .displayName("Password")
+ .description("The password of dameng")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ .sensitive(true)
+ descriptor = password :: descriptor
+
+ val dbtable=new PropertyDescriptor()
+ .name("dbtable")
+ .displayName("DBTable")
+ .description("The table you want to write")
+ .defaultValue("")
+ .required(true)
+ .example("")
+ descriptor = dbtable :: descriptor
+
+ val saveMode = new PropertyDescriptor()
+ .name("saveMode")
+ .displayName("SaveMode")
+ .description("The save mode for table")
+ .allowableValues(saveModeOption)
+ .defaultValue("Append")
+ .required(true)
+ .example("Append")
+ descriptor = saveMode :: descriptor
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/jdbc/dameng.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroup.JdbcGroup)
+ }
+
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/ExcuteSql.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/ExcuteSql.scala
index bcd0a029..71c84ec3 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/ExcuteSql.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/ExcuteSql.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
import java.sql.{Connection, DriverManager, ResultSet}
@@ -33,7 +34,7 @@ class ExcuteSql extends ConfigurableStop {
conn.close()
statement.close()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/ImpalaRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/ImpalaRead.scala
index b42115e8..336ce54c 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/ImpalaRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/ImpalaRead.scala
@@ -1,15 +1,15 @@
package cn.piflow.bundle.jdbc
-import java.sql.{Connection, DriverManager, ResultSet, Statement}
-
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import java.sql.{Connection, DriverManager, ResultSet, Statement}
import scala.collection.mutable.ArrayBuffer
@@ -55,7 +55,7 @@ class ImpalaRead extends ConfigurableStop{
val rdd: RDD[Row] = session.sparkContext.makeRDD(rows)
val df: DataFrame = session.createDataFrame(rdd,schema)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/JdbcReadFromOracle.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/JdbcReadFromOracle.scala
deleted file mode 100644
index 53dac916..00000000
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/JdbcReadFromOracle.scala
+++ /dev/null
@@ -1,210 +0,0 @@
-package cn.piflow.bundle.jdbc
-
-import java.io._
-import java.sql.{Blob, Clob, Connection, Date, DriverManager, NClob, PreparedStatement, ResultSet, SQLXML}
-
-import cn.piflow._
-import cn.piflow.conf._
-import cn.piflow.conf.bean.PropertyDescriptor
-import cn.piflow.conf.util.{ImageUtil, MapUtil}
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql._
-import org.apache.spark.sql.types._
-
-import scala.collection.mutable.ArrayBuffer
-
-class JdbcReadFromOracle extends ConfigurableStop{
-
- val authorEmail: String = "yangqidong@cnic.cn"
- val description: String = "Read from oracle"
- val inportList: List[String] = List(Port.DefaultPort)
- val outportList: List[String] = List(Port.DefaultPort)
-
- var url:String = _
- var user:String = _
- var password:String = _
- var sql:String = _
- var schema:String=_
-
-
- def toByteArray(in: InputStream): Array[Byte] = {
- var byteArray:Array[Byte]=new Array[Byte](1024*1024)
- val out: ByteArrayOutputStream = new ByteArrayOutputStream()
- var n:Int=0
- while ((n=in.read(byteArray)) != -1 && (n != -1)){
- out.write(byteArray,0,n)
- }
- val arr: Array[Byte] = out.toByteArray
- out.close()
- arr
- }
-
- def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val session = pec.get[SparkSession]()
-
- Class.forName("oracle.jdbc.driver.OracleDriver")
- val con: Connection = DriverManager.getConnection(url,user,password)
- val pre: PreparedStatement = con.prepareStatement(sql)
- val rs: ResultSet = pre.executeQuery()
-
-
- val filedNames: Array[String] = schema.split(",").map(x => x.trim)
- var rowsArr:ArrayBuffer[ArrayBuffer[Any]]=ArrayBuffer()
- var rowArr:ArrayBuffer[Any]=ArrayBuffer()
- while (rs.next()){
- rowArr.clear()
- for(fileName <- filedNames){
- val name_type: Array[String] = fileName.split("\\.")
- val name: String = name_type(0)
- val typestr: String = name_type(1)
- if(typestr.toUpperCase.equals("BLOB")){
- val blob: Blob = rs.getBlob(name)
- var byteArr : Array[Byte] =Array()
- if(blob != null){
- val stream: InputStream = blob.getBinaryStream
- byteArr = toByteArray(stream)
- stream.close()
- }
- rowArr+=byteArr
- }else if(typestr.toUpperCase.equals("CLOB") || typestr.toUpperCase.equals("XMLTYPE")){
- val clob: Clob = rs.getClob(name)
- var byteArr : Array[Byte] =Array()
- if(clob != null){
- val stream: InputStream = clob.getAsciiStream
- byteArr = toByteArray(stream)
- stream.close()
- }
- rowArr+=byteArr
- }else if(typestr.toUpperCase.equals("NCLOB")){
- val nclob: NClob = rs.getNClob(name)
- var byteArr : Array[Byte] =Array()
- if(nclob != null){
- val stream: InputStream = nclob.getAsciiStream
- byteArr = toByteArray(stream)
- stream.close()
- }
- rowArr+=byteArr
- }else if(typestr.toUpperCase.equals("DATE")){
- val date: Date = rs.getDate(name)
- rowArr+=date
- }else if(typestr.toUpperCase.equals("NUMBER")){
- val int: Int = rs.getInt(name)
- rowArr+=int
- }else{
- rowArr+=rs.getString(name)
- }
- }
- rowsArr+=rowArr
- }
-
- var nameArrBuff:ArrayBuffer[String]=ArrayBuffer()
- var typeArrBuff:ArrayBuffer[String]=ArrayBuffer()
- filedNames.foreach(x => {
- nameArrBuff+=x.split("\\.")(0)
- typeArrBuff+=x.split("\\.")(1)
- })
- var num:Int=0
- val fields: ArrayBuffer[StructField] = nameArrBuff.map(x => {
- var sf: StructField = null
- val typeName: String = typeArrBuff(num)
- if (typeName.toUpperCase.equals("BLOB") || typeName.toUpperCase.equals("CLOB") || typeName.toUpperCase.equals("NCLOB") || typeName.toUpperCase.equals("XMLTYPE")) {
- sf = StructField(x, DataTypes.createArrayType(ByteType), nullable = true)
- }else if( typeName.toUpperCase.equals("DATE")) {
- sf = StructField(x, DateType, nullable = true)
- }else if( typeName.toUpperCase.equals("NUMBER")) {
- sf = StructField(x, IntegerType, nullable = true)
- }else if( typeName.toUpperCase.equals("XMLTYPE")) {
- sf = StructField(x, IntegerType, nullable = true)
- }else {
- sf = StructField(x, StringType, nullable = true)
- }
- num+=1
- sf
- })
-
- val schemaNew: StructType = StructType(fields)
- val rows: List[Row] = rowsArr.toList.map(arr => {
-
- val row: Row = Row.fromSeq(arr)
- row
- })
- val rdd: RDD[Row] = session.sparkContext.makeRDD(rows)
- val df: DataFrame = session.createDataFrame(rdd,schemaNew)
-
- out.write(df)
- }
-
- def initialize(ctx: ProcessContext): Unit = {
-
- }
-
- override def setProperties(map: Map[String, Any]): Unit = {
- url = MapUtil.get(map,"url").asInstanceOf[String]
- user = MapUtil.get(map,"user").asInstanceOf[String]
- password = MapUtil.get(map,"password").asInstanceOf[String]
- sql = MapUtil.get(map,"sql").asInstanceOf[String]
- schema = MapUtil.get(map,"schema").asInstanceOf[String]
- }
-
- override def getPropertyDescriptor(): List[PropertyDescriptor] = {
- var descriptor : List[PropertyDescriptor] = List()
-
- val url=new PropertyDescriptor()
- .name("url")
- .displayName("Url")
- .description("The Url, for example jdbc:oracle:thin:@192.168.0.1:1521/newdb")
- .defaultValue("")
- .required(true)
- .example("jdbc:oracle:thin:@192.168.0.1:1521/newdb")
- descriptor = url :: descriptor
-
- val user=new PropertyDescriptor()
- .name("user")
- .displayName("User")
- .description("The user name of database")
- .defaultValue("")
- .required(true)
- .example("root")
- descriptor = user :: descriptor
-
- val password=new PropertyDescriptor()
- .name("password")
- .displayName("Password")
- .description("The password of database")
- .defaultValue("")
- .required(true)
- .example("123456")
- descriptor = password :: descriptor
-
- val sql=new PropertyDescriptor()
- .name("sql")
- .displayName("Sql")
- .description("The sql you want")
- .defaultValue("")
- .required(true)
- .language(Language.Sql)
- .example("select * from type")
- descriptor = sql :: descriptor
-
- val schema=new PropertyDescriptor()
- .name("schema")
- .displayName("Schema")
- .description("The name of the field of your SQL statement query, such as: ID.number, name.varchar")
- .defaultValue("")
- .required(true)
- .example("ID.number, name.varchar")
- descriptor = schema :: descriptor
-
- descriptor
- }
-
- override def getIcon(): Array[Byte] = {
- ImageUtil.getImage("icon/jdbc/jdbcReadFromOracle.png")
- }
-
- override def getGroup(): List[String] = {
- List(StopGroup.JdbcGroup)
- }
-
-
-}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlRead.scala
index 34746f77..ec1fd6f7 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlRead.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
@@ -31,7 +32,7 @@ class MysqlRead extends ConfigurableStop {
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlReadIncremental.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlReadIncremental.scala
index 8a88b0e4..5d9fbe7b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlReadIncremental.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlReadIncremental.scala
@@ -4,6 +4,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableIncrementalStop, Language, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
/**
@@ -33,7 +34,7 @@ class MysqlReadIncremental extends ConfigurableIncrementalStop{
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
override def setProperties(map: Map[String, Any]): Unit = {
url = MapUtil.get(map,"url").asInstanceOf[String]
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlWrite.scala
index 4f5a5e6a..1034fad1 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/MysqlWrite.scala
@@ -1,11 +1,11 @@
package cn.piflow.bundle.jdbc
import java.util.Properties
-
import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{SaveMode, SparkSession}
import scala.beans.BeanProperty
@@ -26,13 +26,13 @@ class MysqlWrite extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val jdbcDF = in.read()
+ val jdbcDF = in.read().getSparkDf
val properties = new Properties()
properties.put("user", user)
properties.put("password", password)
properties.put("driver", driver)
jdbcDF.write.mode(SaveMode.valueOf(saveMode)).jdbc(url,dbtable,properties)
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/TbaseRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OpenTenBaseRead.scala
similarity index 84%
rename from piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/TbaseRead.scala
rename to piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OpenTenBaseRead.scala
index 3f24de77..56877a16 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/TbaseRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OpenTenBaseRead.scala
@@ -4,13 +4,14 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
-class TbaseRead extends ConfigurableStop {
+class OpenTenBaseRead extends ConfigurableStop {
- val authorEmail: String = "bbbbbbyz1110@163.com"
- val description: String = "Read data from Tbase database with jdbc"
+ val authorEmail: String = "ygang@cnic.cn"
+ val description: String = "Read data from OpenTenBase database with jdbc"
val inportList: List[String] = List(Port.DefaultPort)
val outportList: List[String] = List(Port.DefaultPort)
@@ -32,7 +33,7 @@ class TbaseRead extends ConfigurableStop {
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
@@ -55,8 +56,8 @@ class TbaseRead extends ConfigurableStop {
val url=new PropertyDescriptor()
.name("url")
.displayName("Url")
- .description("The Url of postgresql database")
- .defaultValue("jdbc:postgresql://127.0.0.1:30004/tbase")
+ .description("The Url of OpenTenBase database")
+ .defaultValue("")
.required(true)
.example("jdbc:postgresql://127.0.0.1:30004/tbase")
descriptor = url :: descriptor
@@ -65,19 +66,19 @@ class TbaseRead extends ConfigurableStop {
val user=new PropertyDescriptor()
.name("user")
.displayName("User")
- .description("The user name of postgresql")
- .defaultValue("tbase")
+ .description("The user name of OpenTenBase")
+ .defaultValue("")
.required(true)
- .example("tbase")
+ .example("")
descriptor = user :: descriptor
val password=new PropertyDescriptor()
.name("password")
.displayName("Password")
- .description("The password of postgresql")
+ .description("The password of OpenTenBase")
.defaultValue("")
.required(true)
- .example("123456")
+ .example("")
.sensitive(true)
descriptor = password :: descriptor
@@ -96,7 +97,7 @@ class TbaseRead extends ConfigurableStop {
.description("The table you want to read")
.defaultValue("")
.required(true)
- .example("test")
+ .example("")
descriptor = tableName :: descriptor
descriptor
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/TbaseWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OpenTenBaseWrite.scala
similarity index 84%
rename from piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/TbaseWrite.scala
rename to piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OpenTenBaseWrite.scala
index 45a527d6..377e628a 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/TbaseWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OpenTenBaseWrite.scala
@@ -4,15 +4,16 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{SaveMode, SparkSession}
import java.util.Properties
-class TbaseWrite extends ConfigurableStop{
+class OpenTenBaseWrite extends ConfigurableStop{
- val authorEmail: String = "bbbbbbyz1110@163.com"
- val description: String = "Write data into Tbase database with jdbc"
+ val authorEmail: String = "ygang@cnic.cn"
+ val description: String = "Write data into OpenTenBase database with jdbc"
val inportList: List[String] = List(Port.DefaultPort)
val outportList: List[String] = List(Port.DefaultPort)
@@ -24,7 +25,7 @@ class TbaseWrite extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val jdbcDF = in.read()
+ val jdbcDF = in.read().getSparkDf
val properties = new Properties()
properties.put("user", user)
properties.put("password", password)
@@ -32,7 +33,7 @@ class TbaseWrite extends ConfigurableStop{
jdbcDF.write
.mode(SaveMode.valueOf(saveMode)).jdbc(url,dbtable,properties)
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
def initialize(ctx: ProcessContext): Unit = {
@@ -54,8 +55,8 @@ class TbaseWrite extends ConfigurableStop{
val url=new PropertyDescriptor()
.name("url")
.displayName("Url")
- .description("The Url of postgresql database")
- .defaultValue("jdbc:postgresql://127.0.0.1:30004/tbase")
+ .description("The Url of OpenTenBase database")
+ .defaultValue("")
.required(true)
.example("jdbc:postgresql://127.0.0.1:30004/tbase")
descriptor = url :: descriptor
@@ -64,16 +65,16 @@ class TbaseWrite extends ConfigurableStop{
val user=new PropertyDescriptor()
.name("user")
.displayName("User")
- .description("The user name of postgresql")
- .defaultValue("tbase")
+ .description("The user name of OpenTenBase")
+ .defaultValue("")
.required(true)
- .example("tbase")
+ .example("")
descriptor = user :: descriptor
val password=new PropertyDescriptor()
.name("password")
.displayName("Password")
- .description("The password of postgresql")
+ .description("The password of OpenTenBase")
.defaultValue("")
.required(true)
.example("123456")
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleRead.scala
index 34b30a48..9792dd8e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleRead.scala
@@ -4,6 +4,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
/**
@@ -31,7 +32,7 @@ class OracleRead extends ConfigurableStop{
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
override def setProperties(map: Map[String, Any]): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleReadByPartition.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleReadByPartition.scala
index 06a3d94e..f4b49659 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleReadByPartition.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleReadByPartition.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.jdbc
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.SparkSession
@@ -140,6 +141,6 @@ class OracleReadByPartition extends ConfigurableStop{
.option("numPartitions",numPartitions)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleWrite.scala
index d5a56bad..92aaa453 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/OracleWrite.scala
@@ -22,7 +22,7 @@ class OracleWrite extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val session = pec.get[SparkSession]()
- val inDF: DataFrame = in.read()
+ val inDF: DataFrame = in.read().getSparkDf
Class.forName("oracle.jdbc.driver.OracleDriver")
val con: Connection = DriverManager.getConnection(url,user,password)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/PostgresqlRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/PostgresqlRead.scala
index f6c372f0..2d8ac4e9 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/PostgresqlRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/PostgresqlRead.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
@@ -32,7 +33,7 @@ class PostgresqlRead extends ConfigurableStop {
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/PostgresqlWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/PostgresqlWrite.scala
index b9b3210a..b87457c0 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/PostgresqlWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/jdbc/PostgresqlWrite.scala
@@ -1,11 +1,11 @@
package cn.piflow.bundle.jdbc
import java.util.Properties
-
import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{SaveMode, SparkSession}
@@ -24,7 +24,7 @@ class PostgresqlWrite extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val jdbcDF = in.read()
+ val jdbcDF = in.read().getSparkDf
val properties = new Properties()
properties.put("user", user)
properties.put("password", password)
@@ -32,7 +32,7 @@ class PostgresqlWrite extends ConfigurableStop{
jdbcDF.write
.mode(SaveMode.valueOf(saveMode)).jdbc(url,dbtable,properties)
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonParser.scala
index 50b6e83e..4bf1670f 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonParser.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonParser.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.{DataFrame, SparkSession}
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class JsonParser extends ConfigurableStop{
val authorEmail: String = "xjzhu@cnic.cn"
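
Several stops (JsonParser above, and the ml_classification / ml_clustering stops later in this diff) take a lighter route: instead of wrapping explicitly, they import `cn.piflow.SciDataFrameImplicits.autoWrapDataFrame`. The implicit itself is not shown in this changeset; a plausible shape, stated as an assumption, would be:

```scala
// Assumed sketch of the implicit these imports rely on -- not taken from this diff.
// An implicit conversion of this shape lets existing out.write(df) calls keep
// compiling once the output stream expects a SciDataFrame.
import scala.language.implicitConversions
import org.apache.spark.sql.DataFrame
import cn.piflow.util.SciDataFrame

object SciDataFrameImplicitsSketch {
  implicit def autoWrapDataFrame(df: DataFrame): SciDataFrame = new SciDataFrame(df)
}
```
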
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonSave.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonSave.scala
index 7a09449d..32b031f2 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonSave.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonSave.scala
@@ -19,7 +19,7 @@ class JsonSave extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val jsonDF = in.read()
+ val jsonDF = in.read().getSparkDf
jsonDF.write.format("json").mode(SaveMode.Overwrite).save(jsonSavePath)
}
@@ -39,7 +39,7 @@ class JsonSave extends ConfigurableStop{
.description("The save path of the json file")
.defaultValue("")
.required(true)
- .example("hdfs://192.168.3.138:8020/work/testJson/test/")
+ .example("/test/test.json")
descriptor = jsonSavePath :: descriptor
descriptor
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonStringParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonStringParser.scala
index c4e45bec..5eff6dd0 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonStringParser.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/json/JsonStringParser.scala
@@ -5,7 +5,7 @@ import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class JsonStringParser extends ConfigurableStop{
val authorEmail: String = "xjzhu@cnic.cn"
val description: String = "Parse json string"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/kafka/ReadFromKafka.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/kafka/ReadFromKafka.scala
index b07e588e..330b0946 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/kafka/ReadFromKafka.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/kafka/ReadFromKafka.scala
@@ -2,12 +2,12 @@ package cn.piflow.bundle.kafka
import java.util
import java.util.{Collections, Properties}
-
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.bundle.util.JedisClusterImplSer
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StructField, StructType}
@@ -67,7 +67,7 @@ class ReadFromKafka extends ConfigurableStop{
//val newRdd=rdd.map(line=>Row.fromSeq(line.toSeq))
val df=spark.sqlContext.createDataFrame(rdd,dfSchema)
//df.show(20)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/kafka/WriteToKafka.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/kafka/WriteToKafka.scala
index b72b0932..8dd8dea8 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/kafka/WriteToKafka.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/kafka/WriteToKafka.scala
@@ -24,7 +24,7 @@ class WriteToKafka extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val df = in.read()
+ val df = in.read().getSparkDf
val properties:Properties = new Properties()
properties.put("bootstrap.servers", kafka_host)
properties.put("acks", "all")
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/ComplementByMemcache.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/ComplementByMemcache.scala
index ce05e15a..397f6a97 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/ComplementByMemcache.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/ComplementByMemcache.scala
@@ -30,7 +30,7 @@
//
// override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
// val session: SparkSession = pec.get[SparkSession]()
-// val inDF: DataFrame = in.read()
+// val inDF: DataFrame = in.read().getSparkDf
//
// val mcc: MemCachedClient =getMcc()
//
@@ -75,7 +75,7 @@
// val schema: StructType = StructType(fields)
// val df: DataFrame = session.createDataFrame(rowRDD,schema)
//
-// out.write(df)
+// out.write(new SciDataFrame(df))
// }
//
//
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/GetMemcache.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/GetMemcache.scala
index b77a3b14..642227b2 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/GetMemcache.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/GetMemcache.scala
@@ -31,7 +31,7 @@
//
// override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
// val session: SparkSession = pec.get[SparkSession]()
-// val inDF: DataFrame = in.read()
+// val inDF: DataFrame = in.read().getSparkDf
//
// val mcc: MemCachedClient =getMcc()
//
@@ -74,7 +74,7 @@
// val s: StructType = StructType(fields)
// val df: DataFrame = session.createDataFrame(rowRDD,s)
//
-// out.write(df)
+// out.write(new SciDataFrame(df))
// }
//
// def getMcc(): MemCachedClient = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/PutMemcache.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/PutMemcache.scala
index c901b0b6..5873f57d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/PutMemcache.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/memcached/PutMemcache.scala
@@ -26,7 +26,7 @@
// override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
//
// val session: SparkSession = pec.get[SparkSession]()
-// val inDF: DataFrame = in.read()
+// val inDF: DataFrame = in.read().getSparkDf
//
// val pool: SockIOPool = SockIOPool.getInstance()
// var serversArr:Array[String]=servers.split(",").map(x => x.trim)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/DecisionTreePrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/DecisionTreePrediction.scala
index 1c4139ec..e0ab3955 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/DecisionTreePrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/DecisionTreePrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class DecisionTreePrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Use an existing decision tree model to predict."
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/DecisionTreeTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/DecisionTreeTraining.scala
index a73fba54..0da6d966 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/DecisionTreeTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/DecisionTreeTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class DecisionTreeTraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Train a decision tree model"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/GBTPrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/GBTPrediction.scala
index 024283a4..f795fbf5 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/GBTPrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/GBTPrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.classification.GBTClassificationModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class GBTPrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Use an existing GBT Model to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/GBTTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/GBTTraining.scala
index e11c8927..49441455 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/GBTTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/GBTTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class GBTTraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Train a GBT Model"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/LogisticRegressionPrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/LogisticRegressionPrediction.scala
index 0f157625..a3b5a646 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/LogisticRegressionPrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/LogisticRegressionPrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class LogisticRegressionPrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Use an existing logistic regression model to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/LogisticRegressionTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/LogisticRegressionTraining.scala
index 17e5c562..414c7c32 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/LogisticRegressionTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/LogisticRegressionTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.LogisticRegression
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class LogisticRegressionTraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Train a logistic regression model"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/MultilayerPerceptronPrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/MultilayerPerceptronPrediction.scala
index 7ca9ade2..7e4a6e74 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/MultilayerPerceptronPrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/MultilayerPerceptronPrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class MultilayerPerceptronPrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Use an existing multilayer perceptron model to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/MultilayerPerceptronTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/MultilayerPerceptronTraining.scala
index 481cbea9..bde0b055 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/MultilayerPerceptronTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/MultilayerPerceptronTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class MultilayerPerceptronTraining extends ConfigurableStop{
val authorEmail: String = "xiaoxiao@cnic.cn"
val description: String = "Train a multilayer perceptron model"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/NaiveBayesPrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/NaiveBayesPrediction.scala
index 7a8e5c18..efbe507d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/NaiveBayesPrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/NaiveBayesPrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class NaiveBayesPrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Use an existing NaiveBayes model to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/NaiveBayesTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/NaiveBayesTraining.scala
index 4060042c..7deb3070 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/NaiveBayesTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/NaiveBayesTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class NaiveBayesTraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Train a NaiveBayes model"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/RandomForestPrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/RandomForestPrediction.scala
index f6c16169..667c2c3c 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/RandomForestPrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/RandomForestPrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.classification.RandomForestClassificationModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class RandomForestPrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "use an existing RandomForest Model to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/RandomForestTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/RandomForestTraining.scala
index dc999bc6..8813fef2 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/RandomForestTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_classification/RandomForestTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class RandomForestTraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Train a RandomForest model"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/BisectingKMeansPrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/BisectingKMeansPrediction.scala
index 0768fd31..7d9bde27 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/BisectingKMeansPrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/BisectingKMeansPrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.BisectingKMeansModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class BisectingKMeansPrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "use an existing BisectingKMeans model to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/BisectingKMeansTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/BisectingKMeansTraining.scala
index cdcad59d..33971423 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/BisectingKMeansTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/BisectingKMeansTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.BisectingKMeans
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class BisectingKMeansTraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "BisectingKMeans clustering"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/GaussianMixturePrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/GaussianMixturePrediction.scala
index 6115b74b..1b2fcd0b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/GaussianMixturePrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/GaussianMixturePrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.GaussianMixtureModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class GaussianMixturePrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Use an existing GaussianMixture Model to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/GaussianMixtureTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/GaussianMixtureTraining.scala
index 8a4abf3e..ceb4e35e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/GaussianMixtureTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/GaussianMixtureTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class GaussianMixtureTraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "GaussianMixture clustering"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/KmeansPrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/KmeansPrediction.scala
index db17533d..18cfc2ac 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/KmeansPrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/KmeansPrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class KmeansPrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Use an existing KmeansModel to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/KmeansTraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/KmeansTraining.scala
index 89382070..5be64f31 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/KmeansTraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/KmeansTraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class KmeansTraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Kmeans clustering"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/LDAPrediction.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/LDAPrediction.scala
index bd513553..ac9b9e52 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/LDAPrediction.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/LDAPrediction.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.{DistributedLDAModel, LDAModel, LocalLDAModel}
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class LDAPrediction extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Use an existing LDAModel to predict"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/LDATraining.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/LDATraining.scala
index ce451cd0..e60e48f3 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/LDATraining.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_clustering/LDATraining.scala
@@ -6,7 +6,7 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class LDATraining extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "LDA clustering"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_feature/WordToVec.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_feature/WordToVec.scala
index 48f7a6a9..fab6edaf 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_feature/WordToVec.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/ml_feature/WordToVec.scala
@@ -7,7 +7,7 @@ import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.feature.Word2VecModel
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class WordToVec extends ConfigurableStop{
val authorEmail: String = "06whuxx@163.com"
val description: String = "Transfer word to vector"
@@ -25,7 +25,7 @@ class WordToVec extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val df=in.read()
+ val df=in.read().getSparkDf
df.createOrReplaceTempView("doc")
sqlContext.udf.register("split",(str:String)=>str.split(" "))
val sqlText:String="select split("+colName+") as "+colName+"_new from doc"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/GetMongo.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/GetMongo.scala
index 527fe6ad..4ad4468d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/GetMongo.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/GetMongo.scala
@@ -14,7 +14,7 @@ import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.bson.Document
import scala.collection.mutable.ArrayBuffer
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class GetMongo extends ConfigurableStop{
override val authorEmail: String = "yangqidong@cnic.cn"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/GetMongoDB.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/GetMongoDB.scala
index 49b8d336..e0535501 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/GetMongoDB.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/GetMongoDB.scala
@@ -5,7 +5,7 @@ import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.{DataFrame, SparkSession}
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class GetMongoDB extends ConfigurableStop{
override val authorEmail: String = "yangqidong@cnic.cn"
override val description: String = "Get data from mongodb"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/PutMongo.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/PutMongo.scala
index 3a20f819..a85f4038 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/PutMongo.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/PutMongo.scala
@@ -25,7 +25,7 @@ class PutMongo extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark: SparkSession = pec.get[SparkSession]()
- val df: DataFrame = in.read()
+ val df: DataFrame = in.read().getSparkDf
var addressesArr: util.ArrayList[ServerAddress] = new util.ArrayList[ServerAddress]()
val ipANDport: Array[String] = addresses.split(",").map(x => x.trim)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/PutMongoDB.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/PutMongoDB.scala
index be54bb16..fef0f22b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/PutMongoDB.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/mongodb/PutMongoDB.scala
@@ -20,7 +20,7 @@ class PutMongoDB extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark: SparkSession = pec.get[SparkSession]()
- val df: DataFrame = in.read()
+ val df: DataFrame = in.read().getSparkDf
df.write.options(
Map("spark.mongodb.output.uri" -> ("mongodb://" + ip + ":" + port + "/" + dataBase + "." + collection))
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/neo4j/PutNeo4j.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/neo4j/PutNeo4j.scala
index 945befe3..2c80f81e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/neo4j/PutNeo4j.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/neo4j/PutNeo4j.scala
@@ -21,7 +21,7 @@ class PutNeo4j extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark: SparkSession = pec.get[SparkSession]()
- val inDf: DataFrame = in.read()
+ val inDf: DataFrame = in.read().getSparkDf
val fileNames: Array[String] = inDf.columns
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/nlp/WordSpliter.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/nlp/WordSpliter.scala
index 6e722fc3..c4f757c0 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/nlp/WordSpliter.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/nlp/WordSpliter.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode
import com.huaban.analysis.jieba._
import org.apache.spark.rdd.RDD
@@ -63,7 +64,7 @@ class WordSpliter extends ConfigurableStop {
))
val df: DataFrame = session.createDataFrame(rowRDD,schema)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/Discretization.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/Discretization.scala
new file mode 100644
index 00000000..96e7f9bd
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/Discretization.scala
@@ -0,0 +1,171 @@
+package cn.piflow.bundle.normalization
+
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import cn.piflow.conf._
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.ml.feature.Bucketizer
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.apache.spark.ml.feature.QuantileDiscretizer
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+
+class Discretization extends ConfigurableStop {
+
+ val authorEmail: String = "zljxnu@163.com"
+ val description: String = "continuous numerical discretization"
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var inputCol: String = _
+ var outputCol: String = _
+ var method: String = _
+ var numBins: Int = _
+ var k: Int = _
+
+ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+ val df = in.read().getSparkDf
+
+    // Apply the discretization method selected by the user
+    val discretizedDF = method match {
+      case "EqualWidth" => equalWidthDiscretization(df, inputCol, outputCol, numBins)
+      case "EqualFrequency" => equalFrequencyDiscretization(df, inputCol, outputCol, numBins)
+      case "KMeans" => kMeansDiscretization(df, inputCol, outputCol, k)
+      case _ => df // default: pass the data through unchanged
+ }
+
+ out.write(discretizedDF)
+ }
+
+  // Equal-width discretization
+ def equalWidthDiscretization(df: DataFrame, inputCol: String, outputCol: String, numBins: Int): DataFrame = {
+ val bucketizer = new Bucketizer()
+ .setInputCol(inputCol)
+ .setOutputCol(outputCol)
+// .setSplits((0 to numBins).map(_.toDouble))
+ .setSplits((0 to numBins).map(_.toDouble).toArray)
+ bucketizer.transform(df)
+ }
+
+  // Equal-frequency discretization
+  def equalFrequencyDiscretization(df: DataFrame, inputCol: String, outputCol: String, numBins: Int): DataFrame = {
+    // Build a QuantileDiscretizer for equal-frequency binning
+    val discretizer = new QuantileDiscretizer()
+      .setInputCol(inputCol) // input column
+      .setOutputCol(outputCol) // output column
+      .setNumBuckets(numBins) // number of buckets
+
+    // Fit on the data (discretizer.fit) and apply the discretization (transform)
+    val dfNew = discretizer.fit(df).transform(df)
+    dfNew // return the discretized DataFrame
+  }
+
+  // Clustering-based (KMeans) discretization
+  def kMeansDiscretization(df: DataFrame, inputCol: String, outputCol: String, k: Int): DataFrame = {
+    // Use KMeans to map the numerical column to integers in [0, k-1]
+ val assembler = new VectorAssembler()
+ .setInputCols(Array(inputCol))
+ .setOutputCol("features")
+ val vectorizedDF = assembler.transform(df)
+
+ val kmeans = new KMeans()
+ .setK(k)
+ .setSeed(1L)
+ .setFeaturesCol("features")
+ .setPredictionCol(outputCol)
+ val model = kmeans.fit(vectorizedDF)
+
+ val clusteredDF = model.transform(vectorizedDF)
+ clusteredDF.drop("features")
+ }
+
+ def initialize(ctx: ProcessContext): Unit = {}
+
+ def setProperties(map: Map[String, Any]): Unit = {
+ inputCol = MapUtil.get(map, "inputCol").asInstanceOf[String]
+ outputCol = MapUtil.get(map, "outputCol").asInstanceOf[String]
+ method = MapUtil.get(map, "method").asInstanceOf[String]
+ numBins = MapUtil.get(map, "numBins").asInstanceOf[String].toInt
+ k = MapUtil.get(map, "k").asInstanceOf[String].toInt
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor: List[PropertyDescriptor] = List()
+
+ val inputColDescriptor = new PropertyDescriptor()
+ .name("inputCol")
+ .displayName("Input Column")
+ .description("The name of the input column to be discretized.")
+ .defaultValue("")
+ .required(true)
+
+ val outputColDescriptor = new PropertyDescriptor()
+ .name("outputCol")
+ .displayName("Output Column")
+ .description("The name of the output column to store discretized values.")
+ .defaultValue("")
+ .required(true)
+
+ val methodDescriptor = new PropertyDescriptor()
+ .name("method")
+ .displayName("Discretization Method")
+ .description("Choose the discretization method: EqualWidth, EqualFrequency, or KMeans.")
+ .allowableValues(Set("EqualWidth", "EqualFrequency", "KMeans"))
+ .defaultValue("EqualWidth")
+ .required(true)
+
+ val numBinsDescriptor = new PropertyDescriptor()
+ .name("numBins")
+ .displayName("Number of Bins")
+ .description("The number of bins to use for EqualWidth and EqualFrequency methods.")
+ .defaultValue("10")
+ .required(false)
+
+ val kDescriptor = new PropertyDescriptor()
+ .name("k")
+ .displayName("Number of Clusters (KMeans only)")
+ .description("The number of clusters to use for the KMeans method.")
+ .defaultValue("3")
+ .required(false)
+
+ descriptor = inputColDescriptor :: descriptor
+ descriptor = outputColDescriptor :: descriptor
+ descriptor = methodDescriptor :: descriptor
+ descriptor = numBinsDescriptor :: descriptor
+ descriptor = kDescriptor :: descriptor
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+    // Return the component icon
+ ImageUtil.getImage("icon/normalization/DiscretizationNormalization.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroup.NormalizationGroup)
+ }
+}
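
A note on the new stop: the equal-width branch builds its splits as 0, 1, ..., numBins, so it implicitly assumes the input column already lies in that interval, while the equal-frequency branch delegates binning to Spark's QuantileDiscretizer. A small standalone illustration of the equal-frequency path (toy data and names are mine, not from the stop):

```scala
// Standalone Spark example of equal-frequency binning, mirroring the setters used
// by Discretization above (setInputCol / setOutputCol / setNumBuckets).
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.sql.SparkSession

object EqualFrequencyDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("equal-frequency-demo").getOrCreate()
    import spark.implicits._
    val df = Seq(1.0, 2.0, 3.0, 10.0, 20.0, 30.0, 100.0, 200.0).toDF("value")

    val discretizer = new QuantileDiscretizer()
      .setInputCol("value")
      .setOutputCol("value_bucket")
      .setNumBuckets(4)                 // each bucket receives roughly the same row count

    discretizer.fit(df).transform(df).show()
    spark.stop()
  }
}
```
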
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/MaxMinNormalization.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/MaxMinNormalization.scala
new file mode 100644
index 00000000..2b655c73
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/MaxMinNormalization.scala
@@ -0,0 +1,180 @@
+package cn.piflow.bundle.normalization
+
+import cn.piflow._
+import cn.piflow.conf._
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+class MaxMinNormalization extends ConfigurableStop {
+  // Author information
+  val authorEmail: String = "zljxnu@163.com"
+  // Component description
+  val description: String = "Min-max (maximum-minimum) normalization"
+  // Input port list
+  val inportList: List[String] = List(Port.DefaultPort)
+  // Output port list
+ val outportList: List[String] = List(Port.DefaultPort)
+
+  // Property: name of the column to normalize
+  var inputCol: String = _
+
+  // Property: name of the output column
+  var outputCol: String = _
+
+  // Initialization
+  def initialize(ctx: ProcessContext): Unit = {}
+
+  // Execution
+  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+    // Get the SparkSession
+    val spark = pec.get[SparkSession]()
+
+    // Read data from the input port
+    val df = in.read().getSparkDf
+
+    // Compute the column's maximum and minimum values
+    val max = df.agg(Map(inputCol -> "max")).collect()(0)(0).asInstanceOf[Double]
+    val min = df.agg(Map(inputCol -> "min")).collect()(0)(0).asInstanceOf[Double]
+
+    // Apply the min-max formula: (value - min) / (max - min)
+    val scaledDf: DataFrame = df.withColumn(outputCol, (df(inputCol) - min) / (max - min))
+
+    // Write the normalized data to the output port
+ out.write(scaledDf)
+ }
+
+  // Set properties
+  def setProperties(map: Map[String, Any]): Unit = {
+    inputCol = MapUtil.get(map, "inputCol").asInstanceOf[String]
+    outputCol = MapUtil.get(map, "outputCol").asInstanceOf[String]
+  }
+
+  // Property descriptors
+  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+    var descriptor: List[PropertyDescriptor] = List()
+    val inputCol = new PropertyDescriptor()
+      .name("inputCol")
+      .displayName("Input Column")
+      .description("The name of the numerical column to apply min-max normalization to")
+      .defaultValue("")
+      .required(true)
+
+    val outputCol = new PropertyDescriptor()
+      .name("outputCol")
+      .displayName("Output Column")
+      .description("The name of the output column that stores the normalized values")
+ .defaultValue("")
+ .required(true)
+
+ descriptor = inputCol :: outputCol :: descriptor
+ descriptor
+ }
+
+  // Component icon
+  override def getIcon(): Array[Byte] = {
+    ImageUtil.getImage("icon/normalization/MaxMinNormalization.png")
+  }
+
+  // Component group
+ override def getGroup(): List[String] = {
+ List(StopGroup.NormalizationGroup)
+ }
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/ScopeNormalization.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/ScopeNormalization.scala
new file mode 100644
index 00000000..902e7ea0
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/ScopeNormalization.scala
@@ -0,0 +1,114 @@
+package cn.piflow.bundle.normalization
+
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import cn.piflow.conf._
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+class ScopeNormalization extends ConfigurableStop {
+
+ // Component author
+ val authorEmail: String = "zljxnu@163.com"
+ // Component description
+ val description: String = "Range normalization: map values into a target range [a, b]"
+ // Input port
+ val inportList: List[String] = List(Port.DefaultPort)
+ // Output port
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ // Input column name
+ var inputCol: String = _
+ // Output column name
+ var outputCol: String = _
+ // Target range [a, b]
+ var range: (Double, Double) = (0.0, 1.0)
+
+ // Data processing logic
+ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ // Get the SparkSession
+ val spark = pec.get[SparkSession]()
+
+ // Read the input data
+ val dfOld = in.read().getSparkDf
+
+ // Map the values into the target range
+ val dfNew = mapToRange(dfOld, inputCol, outputCol, range)
+
+ // Write out the processed data
+ out.write(dfNew)
+ }
+
+ // Initialization method
+ def initialize(ctx: ProcessContext): Unit = {}
+
+ // Set component properties
+ def setProperties(map: Map[String, Any]): Unit = {
+ inputCol = MapUtil.get(map, key = "inputCol").asInstanceOf[String]
+ outputCol = MapUtil.get(map, key = "outputCol").asInstanceOf[String]
+ // Parse a string such as "(0.0, 1.0)" into a (Double, Double) tuple
+ val values = MapUtil.get(map, key = "range").asInstanceOf[String].stripPrefix("(").stripSuffix(")").split(",").map(_.toDouble)
+ range = (values(0), values(1))
+
+ }
+
+ // Component property descriptors
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor: List[PropertyDescriptor] = List()
+ val inputCol = new PropertyDescriptor()
+ .name("inputCol")
+ .displayName("Input Column")
+ .description("要映射的输入列的名称")
+ .defaultValue("")
+ .required(true)
+ .example("input_data")
+
+ val outputCol = new PropertyDescriptor()
+ .name("outputCol")
+ .displayName("Output Column")
+ .description("映射后的输出列的名称")
+ .defaultValue("")
+ .required(true)
+ .example("normalized_data")
+
+ val range = new PropertyDescriptor()
+ .name("range")
+ .displayName("Range")
+ .description("目标范围 [a, b],以元组的形式表示")
+ .defaultValue("")
+ .required(true)
+ .example("(0.0, 1.0)")
+
+ descriptor = inputCol :: outputCol :: range :: descriptor
+ descriptor
+ }
+
+ // Component icon (optional)
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/normalization/ScopeNormalization.png")
+ }
+
+ // Component group (optional)
+ override def getGroup(): List[String] = {
+ List(StopGroup.NormalizationGroup)
+ }
+
+ // Range-mapping helper
+ private def mapToRange(df: DataFrame, inputCol: String, outputCol: String, range: (Double, Double)): DataFrame = {
+ // Compute the column min and max, then map linearly onto [range._1, range._2]
+ val min = df.agg(Map(inputCol -> "min")).collect()(0)(0).asInstanceOf[Double]
+ val max = df.agg(Map(inputCol -> "max")).collect()(0)(0).asInstanceOf[Double]
+ val dfNew = df.withColumn(outputCol, (col(inputCol) - min) / (max - min) * (range._2 - range._1) + range._1)
+ dfNew
+ }
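+ // Worked example of the mapping above (illustrative values, not taken from the source):
+ //   input = [10.0, 20.0, 30.0], min = 10.0, max = 30.0, target range = (0.0, 100.0)
+ //   output = [0.0, 50.0, 100.0]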
+}
\ No newline at end of file
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/ZScore.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/ZScore.scala
new file mode 100644
index 00000000..43887ba3
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/normalization/ZScore.scala
@@ -0,0 +1,91 @@
+package cn.piflow.bundle.normalization
+
+import cn.piflow.conf._
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.apache.spark.sql.functions._
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+class ZScore extends ConfigurableStop {
+
+ // Author email
+ val authorEmail: String = "zljxnu@163.cn"
+ // Description
+ val description: String = "Z-score standardization"
+ // Input port
+ val inportList: List[String] = List(Port.DefaultPort)
+ // Output port
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ // Input column names (comma-separated)
+ var inputCols: String = _
+ // Output column names (comma-separated)
+ var outputCols: String = _
+
+ def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+ val df = in.read().getSparkDf
+
+ // Split the comma-separated input and output column names into lists
+ val inputColList = inputCols.split(",").map(_.trim)
+ val outputColList = outputCols.split(",").map(_.trim)
+
+ // Compute the mean and standard deviation for each input column
+ val stats = inputColList.foldLeft(df) {
+ case (currentDf, inputCol) =>
+ val mean = currentDf.select(avg(col(inputCol))).first().getDouble(0)
+ val stdDev = currentDf.select(stddev(col(inputCol))).first().getDouble(0)
+ // New column name: {inputCol}_zscore
+ val zScoreCol = s"${inputCol}_zscore"
+
+ // Apply the z-score formula: (x - mean) / stddev
+ currentDf.withColumn(zScoreCol, (col(inputCol) - mean) / stdDev)
+ }
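+ // Worked example of the z-score formula above (illustrative values, not taken from the source):
+ //   values = [2.0, 4.0, 6.0], mean = 4.0, sample stddev = 2.0
+ //   z-scores = [(2.0-4.0)/2.0, (4.0-4.0)/2.0, (6.0-4.0)/2.0] = [-1.0, 0.0, 1.0]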
+
+ // Rename the intermediate z-score columns to the requested output column names
+ val finalDf = inputColList.zip(outputColList).foldLeft(stats) {
+ case (currentDf, (inputCol, outputCol)) =>
+ currentDf.withColumnRenamed(s"${inputCol}_zscore", outputCol)
+ }
+
+ out.write(finalDf)
+ }
+
+ def initialize(ctx: ProcessContext): Unit = {}
+
+ def setProperties(map: Map[String, Any]): Unit = {
+ inputCols = MapUtil.get(map, key = "inputCols").asInstanceOf[String]
+ outputCols = MapUtil.get(map, key = "outputCols").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor: List[PropertyDescriptor] = List()
+ val inputCols = new PropertyDescriptor()
+ .name("inputCols")
+ .displayName("输入列")
+ .description("要标准化的列,用逗号分隔。")
+ .defaultValue("")
+ .required(true)
+ .example("特征1, 特征2")
+
+ val outputCols = new PropertyDescriptor()
+ .name("outputCols")
+ .displayName("输出列")
+ .description("用于存储标准化值的相应输出列,用逗号分隔。")
+ .defaultValue("")
+ .required(true)
+ .example("标准化特征1, 标准化特征2")
+
+ descriptor = inputCols :: outputCols :: descriptor
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/normalization/ZScoreNormalization.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List(StopGroup.NormalizationGroup)
+ }
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/oceanbase/OceanBaseRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/oceanbase/OceanBaseRead.scala
index ecc0631c..1b84c931 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/oceanbase/OceanBaseRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/oceanbase/OceanBaseRead.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
@@ -32,7 +33,7 @@ class OceanBaseRead extends ConfigurableStop{
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
override def setProperties(map: Map[String, Any]): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/oceanbase/OceanBaseWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/oceanbase/OceanBaseWrite.scala
index ff92191f..38fe8c06 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/oceanbase/OceanBaseWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/oceanbase/OceanBaseWrite.scala
@@ -30,7 +30,7 @@ class OceanBaseWrite extends ConfigurableStop{
properties.put("password", password)
properties.put("driver",driver)
properties.put("isolationLevel","NONE") //if not set this value, throw expection
- val df = in.read()
+ val df = in.read().getSparkDf
df.write.mode(SaveMode.Append).jdbc(url,dbtable,properties)
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/openLooKeng/OpenLooKengRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/openLooKeng/OpenLooKengRead.scala
index 5004dcfb..27903873 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/openLooKeng/OpenLooKengRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/openLooKeng/OpenLooKengRead.scala
@@ -4,6 +4,7 @@ import cn.piflow._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.sql.SparkSession
@@ -32,7 +33,7 @@ class OpenLooKengRead extends ConfigurableStop{
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
override def setProperties(map: Map[String, Any]): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/rdf/RdfToDF.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/rdf/RdfToDF.scala
index b3bb3eee..20bea70f 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/rdf/RdfToDF.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/rdf/RdfToDF.scala
@@ -10,7 +10,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataTypes, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class RdfToDF extends ConfigurableStop{
override val authorEmail: String = "xiaomeng7890@gmail.com"
@@ -162,7 +162,7 @@ class RdfToDF extends ConfigurableStop{
//in
if (isFront == "true") {
val inDF : Array[String] = in
- .read()
+ .read().getSparkDf
.collect()
.map(r => r.getAs[String](1))
var index = 0
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/redis/ReadFromRedis.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/redis/ReadFromRedis.scala
index 439c8046..b5e8c7bb 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/redis/ReadFromRedis.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/redis/ReadFromRedis.scala
@@ -2,12 +2,12 @@ package cn.piflow.bundle.redis
import java.util
-
import cn.piflow.bundle.util.{JedisClusterImplSer, RedisUtil}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.avro.generic.GenericData.StringType
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{DataType, StructField, StructType}
@@ -33,7 +33,7 @@ class ReadFromRedis extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- var dfIn=in.read()
+ var dfIn=in.read().getSparkDf
var colName=column_name
//connect to redis
@@ -57,7 +57,7 @@ class ReadFromRedis extends ConfigurableStop{
Row.fromSeq(row.toArray.toSeq)
})
val df=spark.createDataFrame(newRDD,dfSchema)
- out.write(df)
+ out.write(new SciDataFrame(df))
}
def initialize(ctx: ProcessContext): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/redis/WriteToRedis.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/redis/WriteToRedis.scala
index 1cf362af..aaa3a9f1 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/redis/WriteToRedis.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/redis/WriteToRedis.scala
@@ -23,7 +23,7 @@ class WriteToRedis extends ConfigurableStop{
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val df = in.read()
+ val df = in.read().getSparkDf
var col_name:String=column_name
df.printSchema()
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/DataFrameRowParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/DataFrameRowParser.scala
index af1f29c1..8ad612b5 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/DataFrameRowParser.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/DataFrameRowParser.scala
@@ -3,6 +3,7 @@ package cn.piflow.bundle.script
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf._
+import cn.piflow.util.SciDataFrame
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
@@ -46,7 +47,7 @@ class DataFrameRowParser extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
- val inDF = in.read()
+ val inDF = in.read().getSparkDf
//parse RDD
val rdd = inDF.rdd.map(row => {
@@ -65,7 +66,7 @@ class DataFrameRowParser extends ConfigurableStop{
//create DataFrame
val df = spark.createDataFrame(rdd,schemaStructType)
//df.show()
- out.write(df)
+ out.write(new SciDataFrame(df))
}
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/DockerExecute.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/DockerExecute.scala
index a50e84dd..e877f3e9 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/DockerExecute.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/DockerExecute.scala
@@ -6,20 +6,20 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.util.PropertyUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{SaveMode, SparkSession}
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
-
-class DockerExecute extends ConfigurableStop{
+class DockerExecute extends ConfigurableStop {
val authorEmail: String = "ygang@cnic.cn"
val description: String = "docker runs Python"
val inportList: List[String] = List(Port.AnyPort)
val outportList: List[String] = List(Port.AnyPort)
- var outports : List[String] = _
- var inports : List[String] = _
+ var outports: List[String] = _
+ var inports: List[String] = _
- var ymlContent:String =_
+ var ymlContent: String = _
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
@@ -34,37 +34,44 @@ class DockerExecute extends ConfigurableStop{
ymlContent = ymlContent.replace("piflow_hdfs_url", PropertyUtil.getPropertyValue("hdfs.web.url"))
.replace("piflow_extra_hosts", stringBuffer.toString)
+ val embedModelsPath = PropertyUtil.getPropertyValue("embed_models_path")
+ if (embedModelsPath != null) {
+ ymlContent = ymlContent.replace("embed_models_path", embedModelsPath)
+ }
println(ymlContent)
-
+
DockerStreamUtil.execRuntime("mkdir app")
- val ymlName = uuid+".yml"
+ val ymlName = uuid + ".yml"
println("执行命令:=============================执行创建app文件夹命令=================")
DockerStreamUtil.execRuntime(s"echo '${ymlContent}'> app/${ymlName}")
- val dockerShellString=s"docker-compose -f app/${ymlName} up"
- val dockerDownShellString=s"docker-compose -f app/${ymlName} down"
+ val dockerShellString = s"docker-compose -f app/${ymlName} up --force-recreate"
+ val dockerDownShellString = s"docker-compose -f app/${ymlName} down"
val inputPath = "/piflow/docker/" + appID + s"/inport_${uuid}/"
val outputPath = "/piflow/docker/" + appID + s"/outport_${uuid}/"
val inputPathStringBuffer = new StringBuffer()
- if(!(inports.contains("Default") || inports.contains("DefaultPort"))){
+ if (!(inports.contains("Default") || inports.contains("DefaultPort"))) {
inports.foreach(x => {
- println("输入端口:============================="+x+"=================")
+ println("输入端口:=============================" + x + "=================")
val hdfsSavePath = inputPath + x
inputPathStringBuffer.append(hdfsSavePath + ",")
- in.read(x).write.format("csv").mode("overwrite")
- .option("delimiter", "\t")
- .option("header", true).save(hdfsSavePath)
+ in.read(x).getSparkDf.write
+ .mode("overwrite") // 指定写入模式,这里是覆盖已存在的文件
+ .parquet(hdfsSavePath)
})
println("执行命令:======================输入路径写入app/inputPath.txt 文件========================")
DockerStreamUtil.execRuntime(s"echo ${inputPath}> app/inputPath.txt")
}
- if(!(outports.contains("Default") || outports.contains("DefaultPort"))){
+ if (!(outports.contains("Default") || outports.contains("DefaultPort"))) {
println("执行命令:======================输出路径写入app/outputPath.txt 文件========================")
DockerStreamUtil.execRuntime(s"echo ${outputPath}> app/outputPath.txt")
}
@@ -72,9 +79,9 @@ class DockerExecute extends ConfigurableStop{
println("执行命令:======================创建镜像命令========================")
DockerStreamUtil.execRuntime(dockerShellString)
- if(!(outports.contains("Default") || outports.contains("DefaultPort"))){
+ if (!(outports.contains("Default") || outports.contains("DefaultPort"))) {
outports.foreach(x => {
- println("输出端口:============================="+x+"=================")
+ println("输出端口:=============================" + x + "=================")
val outDF = spark.read.format("csv")
.option("header", true)
.option("mode", "FAILFAST")
@@ -92,7 +99,7 @@ class DockerExecute extends ConfigurableStop{
val inportStr = MapUtil.get(map, "inports").asInstanceOf[String]
inports = inportStr.split(",").map(x => x.trim).toList
- ymlContent =MapUtil.get(map, key = "ymlContent").asInstanceOf[String]
+ ymlContent = MapUtil.get(map, key = "ymlContent").asInstanceOf[String]
}
@@ -102,7 +109,7 @@ class DockerExecute extends ConfigurableStop{
override def getPropertyDescriptor(): List[PropertyDescriptor] = {
- var descriptor : List[PropertyDescriptor] = List()
+ var descriptor: List[PropertyDescriptor] = List()
val inports = new PropertyDescriptor()
.name("inports")
.displayName("Inports")
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/ExecutePythonWithDataFrame.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/ExecutePythonWithDataFrame.scala
index acf7fa87..01b32e84 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/ExecutePythonWithDataFrame.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/ExecutePythonWithDataFrame.scala
@@ -13,7 +13,7 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
import scala.collection.JavaConversions._
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
/**
* Created by xjzhu@cnic.cn on 2/24/20
*/
@@ -64,7 +64,7 @@ class ExecutePythonWithDataFrame extends ConfigurableStop{
val spark = pec.get[SparkSession]()
- val df = in.read()
+ val df = in.read().getSparkDf
val jep = new Jep()
val scriptPath = "/tmp/pythonExcutor-"+ UUID.randomUUID() +".py"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/ExecuteScala.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/ExecuteScala.scala
index 59e62b93..9e8e026d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/ExecuteScala.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/ExecuteScala.scala
@@ -42,10 +42,10 @@ class ExecuteScala extends ConfigurableStop{
val script = new PropertyDescriptor()
.name("script")
.displayName("script")
- .description("The code of scala. \nUse in.read() to get dataframe from upstream component. \nUse out.write() to write datafram to downstream component.")
+ .description("The code of scala. \nUse in.read().getSparkDf to get dataframe from upstream component. \nUse out.write() to write datafram to downstream component.")
.defaultValue("")
.required(true)
- .example("val df = in.read() \nval df1 = df.select(\"author\").filter($\"author\".like(\"%xjzhu%\")) \ndf1.show() \ndf.createOrReplaceTempView(\"person\") \nval df2 = spark.sql(\"select * from person where author like '%xjzhu%'\") \ndf2.show() \nout.write(df2)")
+ .example("val df = in.read().getSparkDf \nval df1 = df.select(\"author\").filter($\"author\".like(\"%xjzhu%\")) \ndf1.show() \ndf.createOrReplaceTempView(\"person\") \nval df2 = spark.sql(\"select * from person where author like '%xjzhu%'\") \ndf2.show() \nout.write(df2)")
.language(Language.Scala)
descriptor = script :: descriptor
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/PythonExecutor.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/PythonExecutor.scala
index 95c07fcc..7010d35e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/PythonExecutor.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/PythonExecutor.scala
@@ -5,7 +5,7 @@ import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
-import cn.piflow.util.{FileUtil, PropertyUtil, PythonScriptUtil}
+import cn.piflow.util.{FileUtil, PropertyUtil, PythonScriptUtil, SciDataFrame}
import org.apache.spark.SparkFiles
import org.apache.spark.deploy.PythonRunner
import org.apache.spark.sql.SparkSession
@@ -82,7 +82,7 @@ class PythonExecutor extends ConfigurableStop{
val inputPath = "/piflow/python/" + appID + "/inport/default/"
var outputPath = "/piflow/python/" + appID + "/outport/default/"
- val df = in.read()
+ val df = in.read().getSparkDf
df.write.format("csv").mode("overwrite").option("set","\t").save(inputPath)
PythonRunner.main(Array(pyFilePath, pyFiles, "-i " + inputPath, "-o " + outputPath))
@@ -93,7 +93,7 @@ class PythonExecutor extends ConfigurableStop{
.option("mode","FAILFAST")
.load(outputPath)
outDF.show()
- out.write(outDF)
+ out.write(new SciDataFrame(outDF))
}
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/PythonRun.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/PythonRun.scala
index 3532d099..8f97ad76 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/script/PythonRun.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/script/PythonRun.scala
@@ -8,7 +8,7 @@ import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.deploy.PythonRunner
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class PythonRun extends ConfigurableStop{
override val authorEmail: String = ""
override val description: String = ""
@@ -60,7 +60,7 @@ class PythonRun extends ConfigurableStop{
val inputPath = "/piflow/python/" + ID + "/inport/default/"
var outputPath = "/piflow/python/" + ID + "/outport/default/"
- val dataFrame = in.read()
+ val dataFrame = in.read().getSparkDf
dataFrame.write.format("csv").mode("overwrite").option("set","\t").save(inputPath)
PythonRunner.main(Array(pyPath, pyFileshelp, "-i " + inputPath, "-o " + outputPath))
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/solr/GetFromSolr.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/solr/GetFromSolr.scala
index 99572ab0..676a7b4e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/solr/GetFromSolr.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/solr/GetFromSolr.scala
@@ -17,7 +17,7 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ListBuffer
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class GetFromSolr extends ConfigurableStop{
override val authorEmail: String ="yangqidong@cnic.cn"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/solr/PutIntoSolr.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/solr/PutIntoSolr.scala
index 31ecf084..9f9e215b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/solr/PutIntoSolr.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/solr/PutIntoSolr.scala
@@ -31,7 +31,7 @@ class PutIntoSolr extends ConfigurableStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val df: DataFrame = in.read()
+ val df: DataFrame = in.read().getSparkDf
val SchemaList: List[StructField] = df.schema.toList
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/streaming/SocketTextStream.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/streaming/SocketTextStream.scala
index 82970e48..3fe9923e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/streaming/SocketTextStream.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/streaming/SocketTextStream.scala
@@ -8,7 +8,7 @@ import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver}
import org.apache.spark.streaming.{Seconds, StreamingContext}
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class SocketTextStream extends ConfigurableStreamingStop {
override val authorEmail: String = "xjzhu@cnic.cn"
override val description: String = "Receive text data from socket"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/tidb/TidbRead.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/tidb/TidbRead.scala
index 5625d893..cddbd99d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/tidb/TidbRead.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/tidb/TidbRead.scala
@@ -5,7 +5,8 @@ import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Language, Port, StopGroup}
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+import cn.piflow.util.SciDataFrame
class TidbRead extends ConfigurableStop{
override val authorEmail: String = "llei@cnic.com"
@@ -32,7 +33,7 @@ class TidbRead extends ConfigurableStop{
.option("password",password)
.load()
- out.write(jdbcDF)
+ out.write(new SciDataFrame(jdbcDF))
}
override def setProperties(map: Map[String, Any]): Unit = {
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/tidb/TidbWrite.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/tidb/TidbWrite.scala
index c6a93c54..9bb55b39 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/tidb/TidbWrite.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/tidb/TidbWrite.scala
@@ -30,7 +30,7 @@ class TidbWrite extends ConfigurableStop{
properties.put("password", password)
properties.put("driver",driver)
properties.put("isolationLevel","NONE") //if not set this value, throw expection
- val df = in.read()
+ val df = in.read().getSparkDf
df.write.mode(SaveMode.Append).jdbc(url,dbtable,properties)
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/DocxParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/DocxParser.scala
new file mode 100644
index 00000000..0e44a1d9
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/DocxParser.scala
@@ -0,0 +1,145 @@
+package cn.piflow.bundle.unstructured
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil, ProcessUtil}
+import cn.piflow.conf.{ConfigurableStop, Port}
+import cn.piflow.util.{SciDataFrame, UnstructuredUtils}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import com.alibaba.fastjson2.{JSON, JSONArray}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import scala.collection.mutable.ArrayBuffer
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+class DocxParser extends ConfigurableStop {
+ val authorEmail: String = "tianyao@cnic.cn"
+ val description: String = "parse docx to structured data."
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var filePath: String = _
+ var fileSource: String = _
+
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+
+ val unstructuredHost: String = UnstructuredUtils.unstructuredHost()
+ val unstructuredPort: String = UnstructuredUtils.unstructuredPort()
+ if (unstructuredHost == null || unstructuredHost.isEmpty) {
+ println("########## Exception: can not parse, unstructured host is null!!!")
+ throw new Exception("########## Exception: can not parse, unstructured host is null!!!")
+ } else if ("127.0.0.1".equals(unstructuredHost) || "localhost".equals(unstructuredHost)) {
+ println("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ throw new Exception("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ }
+ var localDir = ""
+ if ("hdfs".equals(fileSource)) {
+ // Download the file from HDFS to a local temp directory
+ localDir = UnstructuredUtils.downloadFilesFromHdfs(filePath)
+ }
+
+ //Create a mutable ArrayBuffer to store the parameters of the curl command
+ println("curl start==========================================================================")
+ val curlCommandParams = new ArrayBuffer[String]()
+ curlCommandParams += "curl"
+ curlCommandParams += "-X"
+ curlCommandParams += "POST"
+ curlCommandParams += s"$unstructuredHost:$unstructuredPort/general/v0/general"
+ curlCommandParams += "-H"
+ curlCommandParams += "accept: application/json"
+ curlCommandParams += "-H"
+ curlCommandParams += "Content-Type: multipart/form-data"
+ var fileListSize = 0;
+ if ("hdfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(localDir)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
+ if ("nfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(filePath)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
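+ // The parameters above assemble a multipart POST roughly equivalent to this shell command
+ // (illustrative local path; the actual host, port and file list come from the code above):
+ //   curl -X POST $host:$port/general/v0/general \
+ //     -H 'accept: application/json' -H 'Content-Type: multipart/form-data' \
+ //     -F 'files=@/local/path/test.docx'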
+ val (output, error): (String, String) = ProcessUtil.executeCommand(curlCommandParams.toSeq)
+ if (output.nonEmpty) {
+ // println(output)
+ import spark.implicits._
+ if (fileListSize > 1) {
+ val array: JSONArray = JSON.parseArray(output)
+ var combinedDF: DataFrame = null
+ array.forEach {
+ o =>
+ val jsonString = o.toString
+ val df = spark.read.json(Seq(jsonString).toDS)
+ if (combinedDF == null) {
+ combinedDF = df
+ } else {
+ combinedDF = combinedDF.union(df)
+ }
+ }
+ combinedDF.show(10)
+ out.write(combinedDF)
+ } else {
+ val df = spark.read.json(Seq(output).toDS())
+ df.show(10)
+ out.write(new SciDataFrame(df))
+ }
+ } else {
+ println(s"########## Exception: $error")
+ throw new Exception(s"########## Exception: $error")
+ }
+ //delete local temp file
+ if ("hdfs".equals(fileSource)) {
+ UnstructuredUtils.deleteTempFiles(localDir)
+ }
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ filePath = MapUtil.get(map, "filePath").asInstanceOf[String]
+ fileSource = MapUtil.get(map, "fileSource").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor: List[PropertyDescriptor] = List()
+ val filePath = new PropertyDescriptor()
+ .name("filePath")
+ .displayName("FilePath")
+ .description("The path of the file(.docx)")
+ .defaultValue("/test/test.docx")
+ .required(true)
+ .example("/test/test.docx")
+ descriptor = descriptor :+ filePath
+
+ val fileSource = new PropertyDescriptor()
+ .name("fileSource")
+ .displayName("FileSource")
+ .description("The source of the file ")
+ .defaultValue("hdfs")
+ .allowableValues(Set("hdfs", "nfs"))
+ .required(true)
+ .example("hdfs")
+ descriptor = descriptor :+ fileSource
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/unstructured/DocxParser.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List("unstructured")
+ }
+
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/HtmlParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/HtmlParser.scala
new file mode 100644
index 00000000..dcfbc783
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/HtmlParser.scala
@@ -0,0 +1,145 @@
+package cn.piflow.bundle.unstructured
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil, ProcessUtil}
+import cn.piflow.conf.{ConfigurableStop, Port}
+import cn.piflow.util.{SciDataFrame, UnstructuredUtils}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import com.alibaba.fastjson2.{JSON, JSONArray}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import scala.collection.mutable.ArrayBuffer
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+class HtmlParser extends ConfigurableStop {
+ val authorEmail: String = "tianyao@cnic.cn"
+ val description: String = "parse html to structured data."
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var filePath: String = _
+ var fileSource: String = _
+
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+
+ val unstructuredHost: String = UnstructuredUtils.unstructuredHost()
+ val unstructuredPort: String = UnstructuredUtils.unstructuredPort()
+ if (unstructuredHost == null || unstructuredHost.isEmpty) {
+ println("########## Exception: can not parse, unstructured host is null!!!")
+ throw new Exception("########## Exception: can not parse, unstructured host is null!!!")
+ } else if ("127.0.0.1".equals(unstructuredHost) || "localhost".equals(unstructuredHost)) {
+ println("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ throw new Exception("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ }
+ var localDir = ""
+ if ("hdfs".equals(fileSource)) {
+ // Download the file from HDFS to a local temp directory
+ localDir = UnstructuredUtils.downloadFilesFromHdfs(filePath)
+ }
+
+ //Create a mutable ArrayBuffer to store the parameters of the curl command
+ println("curl start==========================================================================")
+ val curlCommandParams = new ArrayBuffer[String]()
+ curlCommandParams += "curl"
+ curlCommandParams += "-X"
+ curlCommandParams += "POST"
+ curlCommandParams += s"$unstructuredHost:$unstructuredPort/general/v0/general"
+ curlCommandParams += "-H"
+ curlCommandParams += "accept: application/json"
+ curlCommandParams += "-H"
+ curlCommandParams += "Content-Type: multipart/form-data"
+ var fileListSize = 0;
+ if ("hdfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(localDir)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
+ if ("nfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(filePath)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
+ val (output, error): (String, String) = ProcessUtil.executeCommand(curlCommandParams.toSeq)
+ if (output.nonEmpty) {
+ // println(output)
+ import spark.implicits._
+ if (fileListSize > 1) {
+ val array: JSONArray = JSON.parseArray(output)
+ var combinedDF: DataFrame = null
+ array.forEach {
+ o =>
+ val jsonString = o.toString
+ val df = spark.read.json(Seq(jsonString).toDS)
+ if (combinedDF == null) {
+ combinedDF = df
+ } else {
+ combinedDF = combinedDF.union(df)
+ }
+ }
+ combinedDF.show(10)
+ out.write(combinedDF)
+ } else {
+ val df = spark.read.json(Seq(output).toDS())
+ df.show(10)
+ out.write(new SciDataFrame(df))
+ }
+ } else {
+ println(s"########## Exception: $error")
+ throw new Exception(s"########## Exception: $error")
+ }
+ //delete local temp file
+ if ("hdfs".equals(fileSource)) {
+ UnstructuredUtils.deleteTempFiles(localDir)
+ }
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ filePath = MapUtil.get(map, "filePath").asInstanceOf[String]
+ fileSource = MapUtil.get(map, "fileSource").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor: List[PropertyDescriptor] = List()
+ val filePath = new PropertyDescriptor()
+ .name("filePath")
+ .displayName("FilePath")
+ .description("The path of the file(.html/.htm)")
+ .defaultValue("/test/test.html")
+ .required(true)
+ .example("/test/test.html")
+ descriptor = descriptor :+ filePath
+
+ val fileSource = new PropertyDescriptor()
+ .name("fileSource")
+ .displayName("FileSource")
+ .description("The source of the file ")
+ .defaultValue("hdfs")
+ .allowableValues(Set("hdfs", "nfs"))
+ .required(true)
+ .example("hdfs")
+ descriptor = descriptor :+ fileSource
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/unstructured/HtmlParser.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List("unstructured")
+ }
+
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/ImageParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/ImageParser.scala
new file mode 100644
index 00000000..7560e503
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/ImageParser.scala
@@ -0,0 +1,162 @@
+package cn.piflow.bundle.unstructured
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil, ProcessUtil}
+import cn.piflow.conf.{ConfigurableStop, Port}
+import cn.piflow.util.{SciDataFrame, UnstructuredUtils}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import com.alibaba.fastjson2.{JSON, JSONArray}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import scala.collection.mutable.ArrayBuffer
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+class ImageParser extends ConfigurableStop {
+ val authorEmail: String = "tianyao@cnic.cn"
+ val description: String = "parse image to structured data."
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var filePath: String = _
+ var fileSource: String = _
+ var strategy: String = _
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+
+ val unstructuredHost: String = UnstructuredUtils.unstructuredHost()
+ val unstructuredPort: String = UnstructuredUtils.unstructuredPort()
+ if (unstructuredHost == null || unstructuredHost.isEmpty) {
+ println("########## Exception: can not parse, unstructured host is null!!!")
+ throw new Exception("########## Exception: can not parse, unstructured host is null!!!")
+ } else if ("127.0.0.1".equals(unstructuredHost) || "localhost".equals(unstructuredHost)) {
+ println("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ throw new Exception("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ }
+ var localDir = ""
+ if ("hdfs".equals(fileSource)) {
+ // Download the file from HDFS to a local temp directory
+ localDir = UnstructuredUtils.downloadFilesFromHdfs(filePath)
+ }
+
+ //Create a mutable ArrayBuffer to store the parameters of the curl command
+ println("curl start==========================================================================")
+ val curlCommandParams = new ArrayBuffer[String]()
+ curlCommandParams += "curl"
+ curlCommandParams += "-X"
+ curlCommandParams += "POST"
+ curlCommandParams += s"$unstructuredHost:$unstructuredPort/general/v0/general"
+ curlCommandParams += "-H"
+ curlCommandParams += "accept: application/json"
+ curlCommandParams += "-H"
+ curlCommandParams += "Content-Type: multipart/form-data"
+ curlCommandParams += "-F"
+ curlCommandParams += "pdf_infer_table_structure=false"
+ curlCommandParams += "-F"
+ curlCommandParams += s"strategy=$strategy"
+ curlCommandParams += "-F"
+ curlCommandParams += "hi_res_model_name=detectron2_lp"
+ var fileListSize = 0;
+ if ("hdfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(localDir)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
+ if ("nfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(filePath)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
+ val (output, error): (String, String) = ProcessUtil.executeCommand(curlCommandParams.toSeq)
+ if (output.nonEmpty) {
+ // println(output)
+ import spark.implicits._
+ if (fileListSize > 1) {
+ val array: JSONArray = JSON.parseArray(output)
+ var combinedDF: DataFrame = null
+ array.forEach {
+ o =>
+ val jsonString = o.toString
+ val df = spark.read.json(Seq(jsonString).toDS)
+ if (combinedDF == null) {
+ combinedDF = df
+ } else {
+ combinedDF = combinedDF.union(df)
+ }
+ }
+ combinedDF.show(10)
+ out.write(combinedDF)
+ } else {
+ val df = spark.read.json(Seq(output).toDS())
+ df.show(10)
+ out.write(new SciDataFrame(df))
+ }
+ } else {
+ println(s"########## Exception: $error")
+ throw new Exception(s"########## Exception: $error")
+ }
+ //delete local temp file
+ if ("hdfs".equals(fileSource)) {
+ UnstructuredUtils.deleteTempFiles(localDir)
+ }
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ filePath = MapUtil.get(map, "filePath").asInstanceOf[String]
+ fileSource = MapUtil.get(map, "fileSource").asInstanceOf[String]
+ strategy = MapUtil.get(map, "strategy").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor: List[PropertyDescriptor] = List()
+ val filePath = new PropertyDescriptor()
+ .name("filePath")
+ .displayName("FilePath")
+ .description("The path of the file(.png/.jpg/.jpeg/.tiff/.bmp/.heic)")
+ .defaultValue("/test/test.png")
+ .required(true)
+ .example("/test/test.png")
+ descriptor = descriptor :+ filePath
+
+ val fileSource = new PropertyDescriptor()
+ .name("fileSource")
+ .displayName("FileSource")
+ .description("The source of the file ")
+ .defaultValue("hdfs")
+ .allowableValues(Set("hdfs", "nfs"))
+ .required(true)
+ .example("hdfs")
+ descriptor = descriptor :+ fileSource
+
+ val strategy = new PropertyDescriptor()
+ .name("strategy")
+ .displayName("strategy")
+ .description("The method the method that will be used to process the file ")
+ .defaultValue("ocr_only")
+ .allowableValues(Set("ocr_only"))
+ .required(true)
+ .example("ocr_only")
+ descriptor = descriptor :+ strategy
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/unstructured/ImageParser.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List("unstructured")
+ }
+
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/PdfParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/PdfParser.scala
new file mode 100644
index 00000000..9edbdfdc
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/PdfParser.scala
@@ -0,0 +1,165 @@
+package cn.piflow.bundle.unstructured
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil, ProcessUtil}
+import cn.piflow.conf.{ConfigurableStop, Port}
+import cn.piflow.util.{SciDataFrame, UnstructuredUtils}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import com.alibaba.fastjson2.{JSON, JSONArray}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import scala.collection.mutable.ArrayBuffer
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+class PdfParser extends ConfigurableStop {
+ val authorEmail: String = "tianyao@cnic.cn"
+ val description: String = "parse pdf to structured data."
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var filePath: String = _
+ var fileSource: String = _
+ var strategy: String = _
+
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+
+ val unstructuredHost: String = UnstructuredUtils.unstructuredHost()
+ val unstructuredPort: String = UnstructuredUtils.unstructuredPort()
+ if (unstructuredHost == null || unstructuredHost.isEmpty) {
+ println("########## Exception: can not parse, unstructured host is null!!!")
+ throw new Exception("########## Exception: can not parse, unstructured host is null!!!")
+ } else if ("127.0.0.1".equals(unstructuredHost) || "localhost".equals(unstructuredHost)) {
+ println("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ throw new Exception("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ }
+ var localDir = ""
+ if ("hdfs".equals(fileSource)) {
+ // Download the file from HDFS to a local temp directory
+ localDir = UnstructuredUtils.downloadFilesFromHdfs(filePath)
+ }
+
+ //Create a mutable ArrayBuffer to store the parameters of the curl command
+ println("curl start==========================================================================")
+ val curlCommandParams = new ArrayBuffer[String]()
+ curlCommandParams += "curl"
+ curlCommandParams += "-X"
+ curlCommandParams += "POST"
+ curlCommandParams += s"$unstructuredHost:$unstructuredPort/general/v0/general"
+ curlCommandParams += "-H"
+ curlCommandParams += "accept: application/json"
+ curlCommandParams += "-H"
+ curlCommandParams += "Content-Type: multipart/form-data"
+ curlCommandParams += "-F"
+ curlCommandParams += "pdf_infer_table_structure=false"
+ curlCommandParams += "-F"
+ curlCommandParams += s"strategy=$strategy"
+ curlCommandParams += "-F"
+ curlCommandParams += "hi_res_model_name=detectron2_lp"
+ var fileListSize = 0;
+ if ("hdfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(localDir)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ println(s"local path:$path")
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
+ if ("nfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(filePath)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ println(s"local path:$path")
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
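+ // The parameters above assemble a multipart POST roughly equivalent to this shell command
+ // (illustrative local path; host, port, strategy and file list come from the code above):
+ //   curl -X POST $host:$port/general/v0/general \
+ //     -H 'accept: application/json' -H 'Content-Type: multipart/form-data' \
+ //     -F 'pdf_infer_table_structure=false' -F 'strategy=auto' \
+ //     -F 'hi_res_model_name=detectron2_lp' -F 'files=@/local/path/test.pdf'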
+ val (output, error): (String, String) = ProcessUtil.executeCommand(curlCommandParams.toSeq)
+ if (output.nonEmpty) {
+ // println(output)
+ import spark.implicits._
+ if (fileListSize > 1) {
+ val array: JSONArray = JSON.parseArray(output)
+ var combinedDF: DataFrame = null
+ array.forEach {
+ o =>
+ val jsonString = o.toString
+ val df = spark.read.json(Seq(jsonString).toDS)
+ if (combinedDF == null) {
+ combinedDF = df
+ } else {
+ combinedDF = combinedDF.union(df)
+ }
+ }
+ combinedDF.show(10)
+ out.write(combinedDF)
+ } else {
+ val df = spark.read.json(Seq(output).toDS())
+ df.show(10)
+ out.write(new SciDataFrame(df))
+ }
+ } else {
+ println(s"########## Exception: $error")
+ throw new Exception(s"########## Exception: $error")
+ }
+ //delete local temp file
+ if ("hdfs".equals(fileSource)) {
+ UnstructuredUtils.deleteTempFiles(localDir)
+ }
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ filePath = MapUtil.get(map, "filePath").asInstanceOf[String]
+ fileSource = MapUtil.get(map, "fileSource").asInstanceOf[String]
+ strategy = MapUtil.get(map, "strategy").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor: List[PropertyDescriptor] = List()
+ val filePath = new PropertyDescriptor()
+ .name("filePath")
+ .displayName("FilePath")
+ .description("The path of the file(.pdf)")
+ .defaultValue("")
+ .required(true)
+ .example("/test/test.pdf")
+ descriptor = descriptor :+ filePath
+
+ val fileSource = new PropertyDescriptor()
+ .name("fileSource")
+ .displayName("FileSource")
+ .description("The source of the file ")
+ .defaultValue("true")
+ .allowableValues(Set("hdfs", "nfs"))
+ .required(true)
+ .example("hdfs")
+ descriptor = descriptor :+ fileSource
+
+ val strategy = new PropertyDescriptor()
+ .name("strategy")
+ .displayName("strategy")
+ .description("The method the method that will be used to process the file ")
+ .defaultValue("true")
+ .allowableValues(Set("auto", "hi_res", "ocr_only", "fast"))
+ .required(true)
+ .example("auto")
+ descriptor = descriptor :+ strategy
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/unstructured/PdfParser.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List("unstructured")
+ }
+
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/PptxParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/PptxParser.scala
new file mode 100644
index 00000000..1e6dc772
--- /dev/null
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/unstructured/PptxParser.scala
@@ -0,0 +1,146 @@
+package cn.piflow.bundle.unstructured
+
+import cn.piflow.conf.bean.PropertyDescriptor
+import cn.piflow.conf.util.{ImageUtil, MapUtil, ProcessUtil}
+import cn.piflow.conf.{ConfigurableStop, Port}
+import cn.piflow.util.{SciDataFrame, UnstructuredUtils}
+import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
+import com.alibaba.fastjson2.{JSON, JSONArray}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import scala.collection.mutable.ArrayBuffer
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
+
+class PptxParser extends ConfigurableStop {
+ val authorEmail: String = "tianyao@cnic.cn"
+ val description: String = "parse pptx to structured data."
+ val inportList: List[String] = List(Port.DefaultPort)
+ val outportList: List[String] = List(Port.DefaultPort)
+
+ var filePath: String = _
+ var fileSource: String = _
+
+
+ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
+ val spark = pec.get[SparkSession]()
+
+ val unstructuredHost: String = UnstructuredUtils.unstructuredHost()
+ val unstructuredPort: String = UnstructuredUtils.unstructuredPort()
+ if (unstructuredHost == null || unstructuredHost.isEmpty) {
+ println("########## Exception: can not parse, unstructured host is null!!!")
+ throw new Exception("########## Exception: can not parse, unstructured host is null!!!")
+ } else if ("127.0.0.1".equals(unstructuredHost) || "localhost".equals(unstructuredHost)) {
+ println("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ throw new Exception("########## Exception: can not parse, the unstructured host cannot be set to localhost!!!")
+ }
+ var localDir = ""
+ if ("hdfs".equals(fileSource)) {
+ // Download the file from HDFS to a local temp directory
+ localDir = UnstructuredUtils.downloadFilesFromHdfs(filePath)
+ }
+
+ //Create a mutable ArrayBuffer to store the parameters of the curl command
+ println("curl start==========================================================================")
+ val curlCommandParams = new ArrayBuffer[String]()
+ curlCommandParams += "curl"
+ curlCommandParams += "-X"
+ curlCommandParams += "POST"
+ curlCommandParams += s"$unstructuredHost:$unstructuredPort/general/v0/general"
+ curlCommandParams += "-H"
+ curlCommandParams += "accept: application/json"
+ curlCommandParams += "-H"
+ curlCommandParams += "Content-Type: multipart/form-data"
+ var fileListSize = 0;
+ if ("hdfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(localDir)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
+ if ("nfs".equals(fileSource)) {
+ val fileList = UnstructuredUtils.getLocalFilePaths(filePath)
+ fileListSize = fileList.size
+ fileList.foreach { path =>
+ curlCommandParams += "-F"
+ curlCommandParams += s"files=@$path"
+ }
+ }
+ val (output, error): (String, String) = ProcessUtil.executeCommand(curlCommandParams.toSeq)
+ if (output.nonEmpty) {
+ // println(output)
+ import spark.implicits._
+ if (fileListSize > 1) {
+ val array: JSONArray = JSON.parseArray(output)
+ var combinedDF: DataFrame = null
+ array.forEach {
+ o =>
+ val jsonString = o.toString
+ val df = spark.read.json(Seq(jsonString).toDS)
+ if (combinedDF == null) {
+ combinedDF = df
+ } else {
+ combinedDF = combinedDF.union(df)
+ }
+ }
+ combinedDF.show(10)
+ out.write(combinedDF)
+ } else {
+ val df = spark.read.json(Seq(output).toDS())
+ df.show(10)
+ out.write(new SciDataFrame(df))
+ }
+ } else {
+ println(s"########## Exception: $error")
+ throw new Exception(s"########## Exception: $error")
+ }
+ //delete local temp file
+ if ("hdfs".equals(fileSource)) {
+ UnstructuredUtils.deleteTempFiles(localDir)
+ }
+ }
+
+ override def setProperties(map: Map[String, Any]): Unit = {
+ filePath = MapUtil.get(map, "filePath").asInstanceOf[String]
+ fileSource = MapUtil.get(map, "fileSource").asInstanceOf[String]
+ }
+
+ override def getPropertyDescriptor(): List[PropertyDescriptor] = {
+ var descriptor: List[PropertyDescriptor] = List()
+ val filePath = new PropertyDescriptor()
+ .name("filePath")
+ .displayName("FilePath")
+ .description("The path of the file(.pptx)")
+ .defaultValue("")
+ .required(true)
+ .example("/test/test.pptx")
+ descriptor = descriptor :+ filePath
+
+ val fileSource = new PropertyDescriptor()
+ .name("fileSource")
+ .displayName("FileSource")
+ .description("The source of the file ")
+ .defaultValue("true")
+ .allowableValues(Set("hdfs", "nfs"))
+ .required(true)
+ .example("hdfs")
+ descriptor = descriptor :+ fileSource
+
+ descriptor
+ }
+
+ override def getIcon(): Array[Byte] = {
+ ImageUtil.getImage("icon/unstructured/PptxParser.png")
+ }
+
+ override def getGroup(): List[String] = {
+ List("unstructured")
+ }
+
+
+ override def initialize(ctx: ProcessContext): Unit = {
+
+ }
+
+}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/CustomView.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/CustomView.scala
index 3d9681d0..d7663b13 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/CustomView.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/CustomView.scala
@@ -39,7 +39,7 @@ class CustomView extends ConfigurableVisualizationStop {
val hdfs = PropertyUtil.getVisualDataDirectoryPath()
val appID = spark.sparkContext.applicationId
- val df = in.read()
+ val df = in.read().getSparkDf
val filePath= hdfs + appID + "/" + pec.getStopJob().getStopName()
df.repartition(1).write
.format("csv")
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/Histogram.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/Histogram.scala
index a4753b56..a9e40bbe 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/Histogram.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/Histogram.scala
@@ -5,7 +5,7 @@ import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableVisualizationStop, Port, StopGroup, VisualizationType}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class Histogram extends ConfigurableVisualizationStop{
override val authorEmail: String = "xjzhu@cnic.cn"
override val description: String = "Show data with histogram. " +
@@ -59,7 +59,7 @@ class Histogram extends ConfigurableVisualizationStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dataFrame = in.read()
+ val dataFrame = in.read().getSparkDf
dataFrame.createOrReplaceTempView("Histoqram")
if(this.customizedProperties != null || this.customizedProperties.size != 0){
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/LineChart.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/LineChart.scala
index 12a5fccf..b3e6ce2b 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/LineChart.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/LineChart.scala
@@ -5,7 +5,7 @@ import cn.piflow.conf.{ConfigurableVisualizationStop, Port, StopGroup, Visualiza
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class LineChart extends ConfigurableVisualizationStop{
override val authorEmail: String = "xjzhu@cnic.cn"
override val description: String = "Show data with scatter plot. " +
@@ -53,7 +53,7 @@ class LineChart extends ConfigurableVisualizationStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dataFrame = in.read()
+ val dataFrame = in.read().getSparkDf
dataFrame.createOrReplaceTempView("LineChart")
if(this.customizedProperties != null || this.customizedProperties.size != 0){
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/PieChart.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/PieChart.scala
index 8c6ab4ad..dc11f192 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/PieChart.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/PieChart.scala
@@ -5,7 +5,7 @@ import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableVisualizationStop, Port, StopGroup, VisualizationType}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class PieChart extends ConfigurableVisualizationStop {
override val authorEmail: String = "xjzhu@cnic.cn"
override val description: String = "Show data with pie chart. "
@@ -74,7 +74,7 @@ class PieChart extends ConfigurableVisualizationStop {
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext = spark.sqlContext
- val dataFrame = in.read()
+ val dataFrame = in.read().getSparkDf
dataFrame.createOrReplaceTempView("PieChart")
val sqlText = "select " + dimension + "," +indicatorOption+ "(" + indicator + ") from PieChart group by " + dimension;
println("PieChart Sql: " + sqlText)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/ScatterPlotChart.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/ScatterPlotChart.scala
index b02dcc43..452e3dfe 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/ScatterPlotChart.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/ScatterPlotChart.scala
@@ -5,7 +5,7 @@ import cn.piflow.conf.{ConfigurableVisualizationStop, Port, StopGroup, Visualiza
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class ScatterPlotChart extends ConfigurableVisualizationStop{
override val authorEmail: String = "xjzhu@cnic.cn"
override val description: String = "Show data with scatter plot chart." +
@@ -51,7 +51,7 @@ class ScatterPlotChart extends ConfigurableVisualizationStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dataFrame = in.read()
+ val dataFrame = in.read().getSparkDf
dataFrame.createOrReplaceTempView("ScatterPlot")
if(this.customizedProperties != null || this.customizedProperties.size != 0){
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/TableShow.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/TableShow.scala
index ffe3dbd3..5fc187ac 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/TableShow.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/visualization/TableShow.scala
@@ -5,7 +5,7 @@ import cn.piflow.conf.{ConfigurableVisualizationStop, Port, StopGroup, Visualiza
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SparkSession
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class TableShow extends ConfigurableVisualizationStop{
override var visualizationType: String = VisualizationType.Table
override val authorEmail: String = "xjzhu@cnic.cn"
@@ -46,7 +46,7 @@ class TableShow extends ConfigurableVisualizationStop{
override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
val spark = pec.get[SparkSession]()
val sqlContext=spark.sqlContext
- val dataFrame = in.read()
+ val dataFrame = in.read().getSparkDf
dataFrame.createOrReplaceTempView("TableShow")
val sqlText = "select " + showField+ " from TableShow"
println("TableShow Sql: " + sqlText)
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParser.scala
index 4ebb36c6..0e403f5d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParser.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParser.scala
@@ -8,7 +8,7 @@ import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType
import scala.beans.BeanProperty
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class XmlParser extends ConfigurableStop {
val authorEmail: String = "xjzhu@cnic.cn"
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParserColumns.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParserColumns.scala
index 517b2ec4..2f70ea8e 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParserColumns.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParserColumns.scala
@@ -5,9 +5,10 @@ import cn.piflow.bundle.util.XmlToJson
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
+import cn.piflow.util.SciDataFrame
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class XmlParserColumns extends ConfigurableStop {
@@ -22,7 +23,7 @@ class XmlParserColumns extends ConfigurableStop {
val spark = pec.get[SparkSession]()
- val df = in.read()
+ val df = in.read().getSparkDf
spark.sqlContext.udf.register("xmlToJson",(str:String)=>{
XmlToJson.xmlParse(str.replaceAll("\n","\t"))
@@ -50,7 +51,7 @@ class XmlParserColumns extends ConfigurableStop {
val outDF: DataFrame = spark.read.json(rdd)
outDF.printSchema()
- out.write(outDF)
+ out.write(new SciDataFrame(outDF))
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParserFolder.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParserFolder.scala
index fab166d9..390ffa5d 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParserFolder.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlParserFolder.scala
@@ -12,7 +12,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.mutable.ArrayBuffer
import scala.util.control.Breaks._
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
/**
* Created by admin on 2018/8/27.
*/
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlSave.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlSave.scala
index ab6f6e25..7bacfa1a 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlSave.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlSave.scala
@@ -18,7 +18,7 @@ class XmlSave extends ConfigurableStop{
var xmlSavePath:String = _
def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
- val xmlDF = in.read()
+ val xmlDF = in.read().getSparkDf
xmlDF.write.format("xml").save(xmlSavePath)
}
diff --git a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlStringParser.scala b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlStringParser.scala
index 4932b08a..8f8a54cd 100644
--- a/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlStringParser.scala
+++ b/piflow-bundle/src/main/scala/cn/piflow/bundle/xml/XmlStringParser.scala
@@ -13,7 +13,7 @@ import org.dom4j.{Document, DocumentHelper, Element}
import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
-
+import cn.piflow.SciDataFrameImplicits.autoWrapDataFrame
class XmlStringParser extends ConfigurableStop {
override val authorEmail: String = "yangqidong@cnic.cn"
val inportList: List[String] = List(Port.DefaultPort)
diff --git a/piflow-bundle/src/test/scala/cn/piflow/bundle/ceph/CephReadTest.scala b/piflow-bundle/src/test/scala/cn/piflow/bundle/ceph/CephReadTest.scala
new file mode 100644
index 00000000..3a4be9e0
--- /dev/null
+++ b/piflow-bundle/src/test/scala/cn/piflow/bundle/ceph/CephReadTest.scala
@@ -0,0 +1,50 @@
+package cn.piflow.bundle.ceph
+
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+object CephReadTest {
+
+ var cephAccessKey: String = _
+ var cephSecretKey: String = _
+ var cephEndpoint: String = _
+ var types: String = _
+ var path: String = _
+ var header: Boolean = _
+ var delimiter: String = _
+
+ def main(args: Array[String]): Unit = {
+ val spark = SparkSession.builder().
+ master("local[*]").
+ appName("CephReadTest").
+ getOrCreate()
+
+ spark.conf.set("fs.s3a.access.key", cephAccessKey)
+ spark.conf.set("fs.s3a.secret.key", cephSecretKey)
+ spark.conf.set("fs.s3a.endpoint", cephEndpoint)
+ spark.conf.set("fs.s3a.connection.ssl.enabled", "false")
+
+ var df:DataFrame = null
+
+ if (types == "parquet") {
+ df = spark.read
+ .parquet(path)
+ }
+
+ if (types == "csv") {
+
+ df = spark.read
+ .option("header", header)
+ .option("inferSchema", "true")
+ .option("delimiter", delimiter)
+ .csv(path)
+ }
+
+ if (types == "json") {
+ df = spark.read
+ .json(path)
+ }
+ df.show()
+
+ }
+
+}
diff --git a/piflow-bundle/src/test/scala/cn/piflow/bundle/ceph/CephWriteTest.scala b/piflow-bundle/src/test/scala/cn/piflow/bundle/ceph/CephWriteTest.scala
new file mode 100644
index 00000000..3f4237ff
--- /dev/null
+++ b/piflow-bundle/src/test/scala/cn/piflow/bundle/ceph/CephWriteTest.scala
@@ -0,0 +1,55 @@
+package cn.piflow.bundle.ceph
+
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+object CephWriteTest {
+ var cephAccessKey: String = _
+ var cephSecretKey: String = _
+ var cephEndpoint: String = _
+ var types: String = _
+ var path: String = _
+ var header: Boolean = _
+ var delimiter: String = _
+
+
+ def main(args: Array[String]): Unit = {
+ val spark = SparkSession.builder().
+ master("local[*]").
+      appName("CephWriteTest").
+ getOrCreate()
+
+ spark.conf.set("fs.s3a.access.key", cephAccessKey)
+ spark.conf.set("fs.s3a.secret.key", cephSecretKey)
+ spark.conf.set("fs.s3a.endpoint", cephEndpoint)
+ spark.conf.set("fs.s3a.connection.ssl.enabled","false")
+
+
+ import spark.implicits._
+ val df = Seq((1, "json", 10, 1000, "2022-09-27")).toDF("id", "name", "value", "ts", "dt")
+
+ if (types == "parquet") {
+ df.write
+ .format("parquet")
+ .mode("overwrite") // only overwrite
+ .save(path)
+ }
+
+ if (types == "csv") {
+ df.write
+ .format("csv")
+ .option("header", header)
+ .option("delimiter", delimiter)
+ .mode("overwrite")
+ .save(path)
+ }
+
+ if (types == "json") {
+ df.write
+ .format("json")
+ .mode("overwrite")
+ .save(path)
+ }
+
+ }
+
+}
diff --git a/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/DiscretizationTest.scala b/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/DiscretizationTest.scala
new file mode 100644
index 00000000..745b8d90
--- /dev/null
+++ b/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/DiscretizationTest.scala
@@ -0,0 +1,52 @@
+//package cn.piflow.bundle.normalization
+//
+//import cn.piflow.Runner
+//import cn.piflow.conf.bean.FlowBean
+//import cn.piflow.conf.util.{FileUtil, OptionUtil}
+//import cn.piflow.util.PropertyUtil
+//import org.apache.spark.sql.SparkSession
+//import org.h2.tools.Server
+//import org.junit.Test
+//
+//import scala.util.parsing.json.JSON
+//
+//class DiscretizationTest {
+//
+// @Test
+// def DiscretizationFlow(): Unit = {
+//
+// //parse flow json
+// val file = "src/main/resources/flow/normalization/Discretization.json"
+// val flowJsonStr = FileUtil.fileReader(file)
+// val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
+// println(map)
+//
+// //create flow
+// val flowBean = FlowBean(map)
+// val flow = flowBean.constructFlow()
+//
+// val h2Server = Server.createTcpServer("-tcp", "-tcpAllowOthers", "-tcpPort", "50001").start()
+//
+// //execute flow
+// val spark = SparkSession.builder()
+// .master("local[*]")
+// .appName("DiscretizationTest")
+// .config("spark.driver.memory", "1g")
+// .config("spark.executor.memory", "2g")
+// .config("spark.cores.max", "2")
+// .config("hive.metastore.uris",PropertyUtil.getPropertyValue("hive.metastore.uris"))
+// .enableHiveSupport()
+// .getOrCreate()
+//
+// val process = Runner.create()
+// .bind(classOf[SparkSession].getName, spark)
+// .bind("checkpoint.path", "")
+// .bind("debug.path","")
+// .start(flow);
+//
+// process.awaitTermination();
+// val pid = process.pid();
+// println(pid + "!!!!!!!!!!!!!!!!!!!!!")
+// spark.close();
+// }
+//}
\ No newline at end of file
diff --git a/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/MaxMinNormalizationTest.scala b/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/MaxMinNormalizationTest.scala
new file mode 100644
index 00000000..03d9a024
--- /dev/null
+++ b/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/MaxMinNormalizationTest.scala
@@ -0,0 +1,101 @@
+////package cn.piflow.bundle.normalization
+////
+////import cn.piflow.Runner
+////import cn.piflow.conf.bean.FlowBean
+////import cn.piflow.conf.util.{FileUtil, OptionUtil}
+////import org.apache.spark.sql.SparkSession
+////import org.junit.Test
+////import scala.util.parsing.json.JSON
+////
+////class MaxMinNormalizationTest {
+////
+//// @Test
+//// def MaxMinNormalizationTest(): Unit = {
+//// // Parse flow JSON
+//// val file = "src/main/resources/flow/normalization/MaxMinNormalization.json"
+//// val flowJsonStr = FileUtil.fileReader(file)
+//// val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
+//// println(map)
+////
+//// // Create SparkSession
+//// val spark = SparkSession.builder()
+//// .master("local[*]")
+//// .appName("MaxMinNormalizationTest")
+//// .config("spark.driver.memory", "1g")
+//// .config("spark.executor.memory", "2g")
+//// .config("spark.cores.max", "2")
+//// .getOrCreate()
+////
+//// // Create flow
+//// val flowBean = FlowBean(map)
+//// val flow = flowBean.constructFlow()
+////
+//// // Execute flow
+//// val process = Runner.create()
+//// .bind(classOf[SparkSession].getName, spark)
+//// .bind("checkpoint.path", "")
+//// .bind("debug.path", "")
+//// .start(flow)
+////
+//// process.awaitTermination()
+//// val pid = process.pid()
+//// println(s"Flow execution completed. PID: $pid")
+////
+//// // Close SparkSession
+//// spark.close()
+//// }
+////}
+//
+//
+//package cn.piflow.bundle.normalization
+//
+//import cn.piflow.Runner
+//import cn.piflow.conf.bean.FlowBean
+//import cn.piflow.conf.util.{FileUtil, OptionUtil}
+//import cn.piflow.util.PropertyUtil
+//import org.apache.spark.sql.SparkSession
+//import org.h2.tools.Server
+//import org.junit.Test
+//
+//import scala.util.parsing.json.JSON
+//
+//class MaxMinNormalizationTest {
+//
+// @Test
+// def MaxMinNormalizationFlow(): Unit = {
+//
+// //parse flow json
+// val file = "src/main/resources/flow/normalization/MaxMinNormalization.json"
+// val flowJsonStr = FileUtil.fileReader(file)
+// val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
+// println(map)
+//
+// //create flow
+// val flowBean = FlowBean(map)
+// val flow = flowBean.constructFlow()
+//
+// val h2Server = Server.createTcpServer("-tcp", "-tcpAllowOthers", "-tcpPort", "50001").start()
+//
+// //execute flow
+// val spark = SparkSession.builder()
+// .master("local[*]")
+// .appName("MaxMinNormalizationTest")
+// .config("spark.driver.memory", "1g")
+// .config("spark.executor.memory", "2g")
+// .config("spark.cores.max", "2")
+// .config("hive.metastore.uris",PropertyUtil.getPropertyValue("hive.metastore.uris"))
+// .enableHiveSupport()
+// .getOrCreate()
+//
+// val process = Runner.create()
+// .bind(classOf[SparkSession].getName, spark)
+// .bind("checkpoint.path", "")
+// .bind("debug.path","")
+// .start(flow);
+//
+// process.awaitTermination();
+// val pid = process.pid();
+// println(pid + "!!!!!!!!!!!!!!!!!!!!!")
+// spark.close();
+// }
+//}
diff --git a/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/ScopeNormalizationTest.scala b/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/ScopeNormalizationTest.scala
new file mode 100644
index 00000000..965067b5
--- /dev/null
+++ b/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/ScopeNormalizationTest.scala
@@ -0,0 +1,52 @@
+//package cn.piflow.bundle.normalization
+//
+//import cn.piflow.Runner
+//import cn.piflow.conf.bean.FlowBean
+//import cn.piflow.conf.util.{FileUtil, OptionUtil}
+//import cn.piflow.util.PropertyUtil
+//import org.apache.spark.sql.SparkSession
+//import org.h2.tools.Server
+//import org.junit.Test
+//
+//import scala.util.parsing.json.JSON
+//
+//class ScopeNormalizationTest {
+//
+// @Test
+// def ScopeNormalizationFlow(): Unit = {
+//
+// //parse flow json
+// val file = "src/main/resources/flow/normalization/ScopeNormalization.json"
+// val flowJsonStr = FileUtil.fileReader(file)
+// val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
+// println(map)
+//
+// //create flow
+// val flowBean = FlowBean(map)
+// val flow = flowBean.constructFlow()
+//
+// val h2Server = Server.createTcpServer("-tcp", "-tcpAllowOthers", "-tcpPort", "50001").start()
+//
+// //execute flow
+// val spark = SparkSession.builder()
+// .master("local[*]")
+//      .appName("ScopeNormalizationTest")
+// .config("spark.driver.memory", "1g")
+// .config("spark.executor.memory", "2g")
+// .config("spark.cores.max", "2")
+// .config("hive.metastore.uris",PropertyUtil.getPropertyValue("hive.metastore.uris"))
+// .enableHiveSupport()
+// .getOrCreate()
+//
+// val process = Runner.create()
+// .bind(classOf[SparkSession].getName, spark)
+// .bind("checkpoint.path", "")
+// .bind("debug.path","")
+// .start(flow);
+//
+// process.awaitTermination();
+// val pid = process.pid();
+// println(pid + "!!!!!!!!!!!!!!!!!!!!!")
+// spark.close();
+// }
+//}
diff --git a/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/ZScoreTest.scala b/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/ZScoreTest.scala
new file mode 100644
index 00000000..12587327
--- /dev/null
+++ b/piflow-bundle/src/test/scala/cn/piflow/bundle/normalization/ZScoreTest.scala
@@ -0,0 +1,52 @@
+//package cn.piflow.bundle.normalization
+//
+//import cn.piflow.Runner
+//import cn.piflow.conf.bean.FlowBean
+//import cn.piflow.conf.util.{FileUtil, OptionUtil}
+//import cn.piflow.util.PropertyUtil
+//import org.apache.spark.sql.SparkSession
+//import org.h2.tools.Server
+//import org.junit.Test
+//
+//import scala.util.parsing.json.JSON
+//
+//class ZScoreTest {
+//
+// @Test
+// def ZScoreFlow(): Unit = {
+//
+// //parse flow json
+// val file = "src/main/resources/flow/normalization/ZScore.json"
+// val flowJsonStr = FileUtil.fileReader(file)
+// val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
+// println(map)
+//
+// //create flow
+// val flowBean = FlowBean(map)
+// val flow = flowBean.constructFlow()
+//
+// val h2Server = Server.createTcpServer("-tcp", "-tcpAllowOthers", "-tcpPort", "50001").start()
+//
+// //execute flow
+// val spark = SparkSession.builder()
+// .master("local[*]")
+// .appName("ZScoreTest")
+// .config("spark.driver.memory", "1g")
+// .config("spark.executor.memory", "2g")
+// .config("spark.cores.max", "2")
+// .config("hive.metastore.uris",PropertyUtil.getPropertyValue("hive.metastore.uris"))
+// .enableHiveSupport()
+// .getOrCreate()
+//
+// val process = Runner.create()
+// .bind(classOf[SparkSession].getName, spark)
+// .bind("checkpoint.path", "")
+// .bind("debug.path","")
+// .start(flow);
+//
+// process.awaitTermination();
+// val pid = process.pid();
+// println(pid + "!!!!!!!!!!!!!!!!!!!!!")
+// spark.close();
+// }
+//}
\ No newline at end of file
diff --git a/piflow-configure/src/main/scala/cn/piflow/conf/StopGroup.scala b/piflow-configure/src/main/scala/cn/piflow/conf/StopGroup.scala
index acbbff78..48e722cb 100644
--- a/piflow-configure/src/main/scala/cn/piflow/conf/StopGroup.scala
+++ b/piflow-configure/src/main/scala/cn/piflow/conf/StopGroup.scala
@@ -4,6 +4,7 @@ object StopGroup {
val NSFC = "NSFC"
val CommonGroup = "Common"
val CsvGroup = "CSV"
+ val FlightGroup = "Flight"
val HiveGroup = "Hive"
val JdbcGroup = "Jdbc"
val JsonGroup = "Json"
@@ -34,4 +35,6 @@ object StopGroup {
val Alg_ASRGroup = "Algorithms_ASR"
val Python = "Python"
val Visualization = "Visualization"
+ val CephGroup="ceph"
+ val NormalizationGroup = "Normalization"
}
diff --git a/piflow-configure/src/main/scala/cn/piflow/conf/util/ClassUtil.scala b/piflow-configure/src/main/scala/cn/piflow/conf/util/ClassUtil.scala
index 53e0c0db..e753f5d7 100644
--- a/piflow-configure/src/main/scala/cn/piflow/conf/util/ClassUtil.scala
+++ b/piflow-configure/src/main/scala/cn/piflow/conf/util/ClassUtil.scala
@@ -11,7 +11,7 @@ import net.liftweb.json.{JValue, compactRender}
import org.clapper.classutil.ClassFinder
import org.reflections.Reflections
import net.liftweb.json.JsonDSL._
-import sun.misc.BASE64Encoder
+import java.util.Base64
import util.control.Breaks._
@@ -202,7 +202,6 @@ object ClassUtil {
val stopName = bundle.split("\\.").last
val propertyDescriptorList:List[PropertyDescriptor] = stop.getPropertyDescriptor()
propertyDescriptorList.foreach(p=> if (p.allowableValues == null || p.allowableValues == None) p.allowableValues = List(""))
- val base64Encoder = new BASE64Encoder()
var iconArrayByte : Array[Byte]= Array[Byte]()
try{
iconArrayByte = stop.getIcon()
@@ -230,7 +229,7 @@ object ClassUtil {
("customizedAllowValue" -> "")*/
("visualizationType" -> visualizationType) ~
("description" -> stop.description) ~
- ("icon" -> base64Encoder.encode(iconArrayByte)) ~
+ ("icon" -> Base64.getEncoder.encodeToString(iconArrayByte)) ~
("properties" ->
propertyDescriptorList.map { property =>(
("name" -> property.name) ~
diff --git a/piflow-configure/src/main/scala/cn/piflow/conf/util/ProcessUtil.scala b/piflow-configure/src/main/scala/cn/piflow/conf/util/ProcessUtil.scala
new file mode 100644
index 00000000..618080be
--- /dev/null
+++ b/piflow-configure/src/main/scala/cn/piflow/conf/util/ProcessUtil.scala
@@ -0,0 +1,42 @@
+package cn.piflow.conf.util
+
+import java.io.{ByteArrayOutputStream, PrintStream}
+object ProcessUtil {
+
+ /**
+   * Execute an external command and return its standard output and standard error.
+   *
+   * @param command the command to execute, together with its arguments
+   * @return a tuple containing the standard output and the standard error
+ */
+ def executeCommand(command: Seq[String]): (String, String) = {
+ val processBuilder = new ProcessBuilder(command: _*)
+ val outBuffer = new ByteArrayOutputStream()
+ val errBuffer = new ByteArrayOutputStream()
+ val outStream = new PrintStream(outBuffer)
+ val errStream = new PrintStream(errBuffer)
+
+ val process = processBuilder.start()
+ val threadOut = new Thread(() => scala.io.Source.fromInputStream(process.getInputStream()).getLines().foreach(outStream.println))
+ val threadErr = new Thread(() => scala.io.Source.fromInputStream(process.getErrorStream()).getLines().foreach(errStream.println))
+
+ threadOut.start()
+ threadErr.start()
+
+    // Wait for the process to finish
+ process.waitFor()
+ threadOut.join()
+ threadErr.join()
+
+    // Close the output streams
+ outStream.close()
+ errStream.close()
+
+    // Collect the output and error strings
+ val output = outBuffer.toString("UTF-8")
+ val error = errBuffer.toString("UTF-8")
+
+    // Return stdout and stderr
+ (output, error)
+ }
+}
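
A hypothetical caller of `ProcessUtil.executeCommand`, matching the `(String, String)` return signature defined above; the command and object name are illustrative only.

```scala
// Illustrative usage of ProcessUtil.executeCommand (not part of the original change).
import cn.piflow.conf.util.ProcessUtil

object ProcessUtilExample {
  def main(args: Array[String]): Unit = {
    val (output, error) = ProcessUtil.executeCommand(Seq("ls", "-l", "/tmp"))
    if (error.nonEmpty) {
      System.err.println(s"command wrote to stderr:\n$error")
    }
    println(output)
  }
}
```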
diff --git a/piflow-configure/src/main/scala/cn/piflow/conf/util/ScalaExecutorUtil.scala b/piflow-configure/src/main/scala/cn/piflow/conf/util/ScalaExecutorUtil.scala
index c463a634..17328ba3 100644
--- a/piflow-configure/src/main/scala/cn/piflow/conf/util/ScalaExecutorUtil.scala
+++ b/piflow-configure/src/main/scala/cn/piflow/conf/util/ScalaExecutorUtil.scala
@@ -73,7 +73,7 @@ object ScalaExecutorUtil {
def main(args: Array[String]): Unit = {
val script =
"""
- |val df = in.read()
+ |val df = in.read().getSparkDf
|df.show()
|val df1 = df.select("title")
|out.write(df1)
diff --git a/piflow-core/pom.xml b/piflow-core/pom.xml
index b84642aa..39dc899c 100644
--- a/piflow-core/pom.xml
+++ b/piflow-core/pom.xml
@@ -22,6 +22,16 @@
             <artifactId>lift-json_2.12</artifactId>
             <version>3.3.0</version>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-core_2.12</artifactId>
+            <version>${spark.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-sql_2.12</artifactId>
+            <version>${spark.version}</version>
+        </dependency>
diff --git a/piflow-core/src/main/java/cn/piflow/util/SciDataFrame.java b/piflow-core/src/main/java/cn/piflow/util/SciDataFrame.java
new file mode 100644
index 00000000..c68f829a
--- /dev/null
+++ b/piflow-core/src/main/java/cn/piflow/util/SciDataFrame.java
@@ -0,0 +1,189 @@
+package cn.piflow.util;
+
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowFileReader;
+import org.apache.arrow.vector.ipc.ArrowFileWriter;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.sql.*;
+import org.apache.spark.api.java.function.MapFunction;
+import java.util.*;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.types.StructType;
+import scala.Function1;
+import scala.collection.TraversableOnce;
+
+//public class SciDataFrame implements Iterable {
+public class SciDataFrame {
+ public enum Level { FOLDER, FILE }
+ public enum FileFormat {
+ TEXT, JSON, PARQUET;
+ public String getFormatString() {
+ return this.name().toLowerCase();
+ }
+ }
+
+ // Fields
+ private UUID id;
+ private List