1+ {-# LANGUAGE AllowAmbiguousTypes #-}
2+ {-# LANGUAGE GADTs #-}
13{-# LANGUAGE OverloadedStrings #-}
4+ {-# LANGUAGE ScopedTypeVariables #-}
25{-# LANGUAGE StrictData #-}
36{-# LANGUAGE TypeApplications #-}
47
@@ -13,9 +16,20 @@ import Data.Int
1316import qualified Data.Map as M
1417import Data.Maybe
1518import qualified Data.Text as T
19+ import Data.Typeable (Typeable )
20+ import qualified Data.Vector as V
21+ import qualified Data.Vector.Unboxed as VU
1622import Data.Word
1723import DataFrame.IO.Parquet.Binary
1824import DataFrame.IO.Parquet.Types
25+ import qualified DataFrame.Internal.Column as DI
26+ import DataFrame.Internal.DataFrame (DataFrame , unsafeGetColumn )
27+ import qualified DataFrame.Operations.Core as DI
28+ import Type.Reflection (
29+ eqTypeRep ,
30+ typeRep ,
31+ (:~~:) (HRefl ),
32+ )
1933
2034data SchemaElement = SchemaElement
2135 { elementName :: T. Text
@@ -31,6 +45,39 @@ data SchemaElement = SchemaElement
3145 }
3246 deriving (Show , Eq )
3347
48+ createParquetSchema :: DataFrame -> [SchemaElement ]
49+ createParquetSchema df = schemaDef : map toSchemaElement (DI. columnNames df)
50+ where
51+ -- The schema always contains an initial element
52+ -- indicating the group of fields.
53+ schemaDef =
54+ SchemaElement
55+ { elementName = " schema"
56+ , elementType = STOP
57+ , typeLength = 0
58+ , numChildren = fromIntegral (snd (DI. dimensions df))
59+ , fieldId = - 1
60+ , repetitionType = UNKNOWN_REPETITION_TYPE
61+ , convertedType = 0
62+ , scale = 0
63+ , precision = 0
64+ , logicalType = LOGICAL_TYPE_UNKNOWN
65+ }
66+ toSchemaElement colName =
67+ let
68+ colType :: TType
69+ colType = case unsafeGetColumn colName df of
70+ (DI. BoxedColumn (col :: V. Vector a )) -> haskellToTType @ a
71+ (DI. UnboxedColumn (col :: VU. Vector a )) -> haskellToTType @ a
72+ (DI. OptionalColumn (col :: V. Vector (Maybe a ))) -> haskellToTType @ a
73+ lType =
74+ if DI. hasElemType @ T. Text (unsafeGetColumn colName df)
75+ || DI. hasElemType @ (Maybe T. Text ) (unsafeGetColumn colName df)
76+ then STRING_TYPE
77+ else LOGICAL_TYPE_UNKNOWN
78+ in
79+ SchemaElement colName colType 0 0 (- 1 ) OPTIONAL 0 0 0 lType
80+
3481data KeyValue = KeyValue
3582 { key :: String
3683 , value :: String
@@ -68,6 +115,29 @@ data TType
68115 | UUID
69116 deriving (Show , Eq )
70117
118+ haskellToTType :: forall a . (Typeable a ) => TType
119+ haskellToTType
120+ | is @ Bool = BOOL
121+ | is @ Int8 = BYTE
122+ | is @ Word8 = BYTE
123+ | is @ Int16 = I16
124+ | is @ Word16 = I16
125+ | is @ Int32 = I32
126+ | is @ Word32 = I32
127+ | is @ Int64 = I64
128+ | is @ Word64 = I64
129+ | is @ Float = FLOAT
130+ | is @ Double = DOUBLE
131+ | is @ String = STRING
132+ | is @ T. Text = STRING
133+ | is @ BS. ByteString = STRING
134+ | otherwise = STOP
135+ where
136+ is :: forall x . (Typeable x ) => Bool
137+ is = case eqTypeRep (typeRep @ a ) (typeRep @ x ) of
138+ Just HRefl -> True
139+ Nothing -> False
140+
71141defaultMetadata :: FileMetadata
72142defaultMetadata =
73143 FileMetaData
0 commit comments