Skip to content

Commit bb7d567

Browse files
committed
multi-thread(remaining bugs)
1 parent 284899f commit bb7d567

25 files changed

+1653
-228
lines changed

.gitignore

+6-1
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,9 @@ build/
3535
.vscode/
3636

3737
### Mac OS ###
38-
.DS_Store
38+
.DS_Store
39+
40+
### data ###
41+
#./input/
42+
./output/
43+
*.parquet

.idea/.gitignore

-8
This file was deleted.

.idea/misc.xml

+1-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/vcs.xml

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/workspace.xml

+166
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README-zn.md

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# ParquetWriter
2+
本项目目标是利用[parquet-java](https://github.com/apache/parquet-java)
3+
生成tpch和clickbench的parquet的数据文件(prarrow.parquet的praquetWriter依赖于prarrow-cpp,版本比较老且无法指定rowGroupSize)
4+
5+
本项目附属于[PixelsDB](https://github.com/pixelsdb)
6+
7+
## 类型转换
8+
采取parquet的Logical Type构造message
9+
10+
## 多线程读取
11+
为了使用多线程读取文件
12+
需要对文件进行切分
13+
使用linux的 spilt命令
14+
```bash
15+
# test
16+
split -l 30 test1.csv ./test1/test1-csv-
17+
split -l 40 test2.csv ./test2/test2-csv-
18+
19+
```
20+
21+
## 参考
22+
- https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
23+
- https://cloud.tencent.com/developer/article/2439115
24+
- https://trino.io/docs/current/language/types.html

convert.py

+133
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import json
2+
3+
4+
field_types = {
5+
"bigint": "int64",
6+
"smallint": "int32",
7+
"varchar(65535)": "string",
8+
"timestamp": "timestamp",
9+
"date": "date",
10+
"int": "int32"
11+
}
12+
13+
14+
fields = [
15+
("watchid", "bigint"),
16+
("javaenable", "smallint"),
17+
("title", "varchar(65535)"),
18+
("goodevent", "smallint"),
19+
("eventtime", "timestamp"),
20+
("eventdate", "date"),
21+
("counterid", "int"),
22+
("clientip", "int"),
23+
("regionid", "int"),
24+
("userid", "bigint"),
25+
("counterclass", "smallint"),
26+
("os", "smallint"),
27+
("useragent", "smallint"),
28+
("url", "varchar(65535)"),
29+
("referer", "varchar(65535)"),
30+
("isrefresh", "smallint"),
31+
("referercategoryid", "smallint"),
32+
("refererregionid", "int"),
33+
("urlcategoryid", "smallint"),
34+
("urlregionid", "int"),
35+
("resolutionwidth", "smallint"),
36+
("resolutionheight", "smallint"),
37+
("resolutiondepth", "smallint"),
38+
("flashmajor", "smallint"),
39+
("flashminor", "smallint"),
40+
("flashminor2", "varchar(65535)"),
41+
("netmajor", "smallint"),
42+
("netminor", "smallint"),
43+
("useragentmajor", "smallint"),
44+
("useragentminor", "varchar(65535)"),
45+
("cookieenable", "smallint"),
46+
("javascriptenable", "smallint"),
47+
("ismobile", "smallint"),
48+
("mobilephone", "smallint"),
49+
("mobilephonemodel", "varchar(65535)"),
50+
("params", "varchar(65535)"),
51+
("ipnetworkid", "int"),
52+
("traficsourceid", "smallint"),
53+
("searchengineid", "smallint"),
54+
("searchphrase", "varchar(65535)"),
55+
("advengineid", "smallint"),
56+
("isartifical", "smallint"),
57+
("windowclientwidth", "smallint"),
58+
("windowclientheight", "smallint"),
59+
("clienttimezone", "smallint"),
60+
("clienteventtime", "timestamp"),
61+
("silverlightversion1", "smallint"),
62+
("silverlightversion2", "smallint"),
63+
("silverlightversion3", "int"),
64+
("silverlightversion4", "smallint"),
65+
("pagecharset", "varchar(65535)"),
66+
("codeversion", "int"),
67+
("islink", "smallint"),
68+
("isdownload", "smallint"),
69+
("isnotbounce", "smallint"),
70+
("funiqid", "bigint"),
71+
("originalurl", "varchar(65535)"),
72+
("hid", "int"),
73+
("isoldcounter", "smallint"),
74+
("isevent", "smallint"),
75+
("isparameter", "smallint"),
76+
("dontcounthits", "smallint"),
77+
("withhash", "smallint"),
78+
("hitcolor", "varchar(65535)"),
79+
("localeventtime", "timestamp"),
80+
("age", "smallint"),
81+
("sex", "smallint"),
82+
("income", "smallint"),
83+
("interests", "smallint"),
84+
("robotness", "smallint"),
85+
("remoteip", "int"),
86+
("windowname", "int"),
87+
("openername", "int"),
88+
("historylength", "smallint"),
89+
("browserlanguage", "varchar(65535)"),
90+
("browsercountry", "varchar(65535)"),
91+
("socialnetwork", "varchar(65535)"),
92+
("socialaction", "varchar(65535)"),
93+
("httperror", "smallint"),
94+
("sendtiming", "int"),
95+
("dnstiming", "int"),
96+
("connecttiming", "int"),
97+
("responsestarttiming", "int"),
98+
("responseendtiming", "int"),
99+
("fetchtiming", "int"),
100+
("socialsourcenetworkid", "smallint"),
101+
("socialsourcepage", "varchar(65535)"),
102+
("paramprice", "bigint"),
103+
("paramorderid", "varchar(65535)"),
104+
("paramcurrency", "varchar(65535)"),
105+
("paramcurrencyid", "smallint"),
106+
("openstatservicename", "varchar(65535)"),
107+
("openstatcampaignid", "varchar(65535)"),
108+
("openstatadid", "varchar(65535)"),
109+
("openstatsourceid", "varchar(65535)"),
110+
("utmsource", "varchar(65535)"),
111+
("utmmedium", "varchar(65535)"),
112+
("utmcampaign", "varchar(65535)"),
113+
("utmcontent", "varchar(65535)"),
114+
("utmterm", "varchar(65535)"),
115+
("fromtag", "varchar(65535)"),
116+
("hasgclid", "smallint"),
117+
("refererhash", "bigint"),
118+
("urlhash", "bigint"),
119+
("clid", "int")
120+
]
121+
122+
123+
converted_types = [field_types.get(field[1], field[1]) for field in fields]
124+
125+
126+
output = {
127+
"schema": {
128+
"hits": converted_types
129+
}
130+
}
131+
print(converted_types)
132+
133+
print(json.dumps(output, indent=2))

0 commit comments

Comments
 (0)