Skip to content

Commit fe336df

Browse files
committed
chore: Add createParquetSchema and haskellToTType.
1 parent 45344f2 commit fe336df

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

src/DataFrame/IO/Parquet/Thrift.hs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
{-# LANGUAGE AllowAmbiguousTypes #-}
2+
{-# LANGUAGE GADTs #-}
13
{-# LANGUAGE OverloadedStrings #-}
4+
{-# LANGUAGE ScopedTypeVariables #-}
25
{-# LANGUAGE StrictData #-}
36
{-# LANGUAGE TypeApplications #-}
47

@@ -13,9 +16,20 @@ import Data.Int
1316
import qualified Data.Map as M
1417
import Data.Maybe
1518
import qualified Data.Text as T
19+
import Data.Typeable (Typeable)
20+
import qualified Data.Vector as V
21+
import qualified Data.Vector.Unboxed as VU
1622
import Data.Word
1723
import DataFrame.IO.Parquet.Binary
1824
import DataFrame.IO.Parquet.Types
25+
import qualified DataFrame.Internal.Column as DI
26+
import DataFrame.Internal.DataFrame (DataFrame, unsafeGetColumn)
27+
import qualified DataFrame.Operations.Core as DI
28+
import Type.Reflection (
29+
eqTypeRep,
30+
typeRep,
31+
(:~~:) (HRefl),
32+
)
1933

2034
data SchemaElement = SchemaElement
2135
{ elementName :: T.Text
@@ -31,6 +45,39 @@ data SchemaElement = SchemaElement
3145
}
3246
deriving (Show, Eq)
3347

48+
createParquetSchema :: DataFrame -> [SchemaElement]
49+
createParquetSchema df = schemaDef : map toSchemaElement (DI.columnNames df)
50+
where
51+
-- The schema always contains an initial element
52+
-- indicating the group of fields.
53+
schemaDef =
54+
SchemaElement
55+
{ elementName = "schema"
56+
, elementType = STOP
57+
, typeLength = 0
58+
, numChildren = fromIntegral (snd (DI.dimensions df))
59+
, fieldId = -1
60+
, repetitionType = UNKNOWN_REPETITION_TYPE
61+
, convertedType = 0
62+
, scale = 0
63+
, precision = 0
64+
, logicalType = LOGICAL_TYPE_UNKNOWN
65+
}
66+
toSchemaElement colName =
67+
let
68+
colType :: TType
69+
colType = case unsafeGetColumn colName df of
70+
(DI.BoxedColumn (col :: V.Vector a)) -> haskellToTType @a
71+
(DI.UnboxedColumn (col :: VU.Vector a)) -> haskellToTType @a
72+
(DI.OptionalColumn (col :: V.Vector (Maybe a))) -> haskellToTType @a
73+
lType =
74+
if DI.hasElemType @T.Text (unsafeGetColumn colName df)
75+
|| DI.hasElemType @(Maybe T.Text) (unsafeGetColumn colName df)
76+
then STRING_TYPE
77+
else LOGICAL_TYPE_UNKNOWN
78+
in
79+
SchemaElement colName colType 0 0 (-1) OPTIONAL 0 0 0 lType
80+
3481
data KeyValue = KeyValue
3582
{ key :: String
3683
, value :: String
@@ -68,6 +115,29 @@ data TType
68115
| UUID
69116
deriving (Show, Eq)
70117

118+
haskellToTType :: forall a. (Typeable a) => TType
119+
haskellToTType
120+
| is @Bool = BOOL
121+
| is @Int8 = BYTE
122+
| is @Word8 = BYTE
123+
| is @Int16 = I16
124+
| is @Word16 = I16
125+
| is @Int32 = I32
126+
| is @Word32 = I32
127+
| is @Int64 = I64
128+
| is @Word64 = I64
129+
| is @Float = FLOAT
130+
| is @Double = DOUBLE
131+
| is @String = STRING
132+
| is @T.Text = STRING
133+
| is @BS.ByteString = STRING
134+
| otherwise = STOP
135+
where
136+
is :: forall x. (Typeable x) => Bool
137+
is = case eqTypeRep (typeRep @a) (typeRep @x) of
138+
Just HRefl -> True
139+
Nothing -> False
140+
71141
defaultMetadata :: FileMetadata
72142
defaultMetadata =
73143
FileMetaData

0 commit comments

Comments
 (0)