|
19 | 19 | use arrow_json::reader::infer_json_schema_from_iterator;
|
20 | 20 | use arrow_schema::Schema;
|
21 | 21 | use once_cell::sync::OnceCell;
|
22 |
| -use std::collections::HashMap; |
| 22 | +use serde::Deserialize; |
| 23 | +use serde_json::Value; |
| 24 | +use std::{collections::HashMap, fs, path::Path}; |
23 | 25 |
|
24 | 26 | use crate::{event::format::update_data_type_to_datetime, utils::json::flatten_json_body};
|
25 | 27 |
|
26 | 28 | // Expose some static variables for internal usage
|
27 | 29 | pub static KNOWN_SCHEMA_LIST: OnceCell<HashMap<String, Schema>> = OnceCell::new();
|
| 30 | +const FORMATS_JSON: &str = include_str!("known-formats/formats.json"); |
| 31 | + |
| 32 | +#[derive(Debug, Deserialize)] |
| 33 | +struct Format { |
| 34 | + name: String, |
| 35 | + schema_type: String, |
| 36 | + sample_json_path: String, |
| 37 | +} |
28 | 38 |
|
29 | 39 | pub fn detect_schema() -> HashMap<String, Schema> {
|
30 | 40 | let mut known_schema_list: HashMap<String, Schema> = HashMap::new();
|
31 |
| - //read file formats.json |
32 |
| - let formats_file = std::fs::File::open("src/event/known-formats/formats.json").unwrap(); |
33 |
| - let formats_reader = std::io::BufReader::new(formats_file); |
34 |
| - let formats: serde_json::Value = serde_json::from_reader(formats_reader).unwrap(); |
35 |
| - //iterate over the formats |
36 |
| - for format in formats.as_array().unwrap() { |
37 |
| - let schema_type = format["schema_type"].as_str().unwrap(); |
38 |
| - let sample_json_path = format["sample_json_path"].as_str().unwrap(); |
39 |
| - let sample_file = std::fs::File::open(sample_json_path).unwrap(); |
40 |
| - let sample_reader = std::io::BufReader::new(sample_file); |
41 |
| - let sample_json: serde_json::Value = serde_json::from_reader(sample_reader).unwrap(); |
42 |
| - let flattened_json = flatten_json_body(sample_json, None, None, None, false).unwrap(); |
43 |
| - let sample_json_records = [flattened_json.clone()]; |
44 |
| - let mut schema = |
45 |
| - infer_json_schema_from_iterator(sample_json_records.iter().map(Ok)).unwrap(); |
46 |
| - schema = update_data_type_to_datetime(schema, flattened_json, Vec::new()); |
47 |
| - known_schema_list.insert(schema_type.to_string(), schema); |
| 41 | + let json_data: serde_json::Value = serde_json::from_str(FORMATS_JSON).unwrap(); |
| 42 | + |
| 43 | + let formats: Vec<Format> = |
| 44 | + serde_json::from_value(json_data).expect("Failed to parse formats.json"); |
| 45 | + |
| 46 | + for format in &formats { |
| 47 | + let sample_path = Path::new(&format.sample_json_path); |
| 48 | + let schema_type = &format.schema_type; |
| 49 | + let _name = &format.name; |
| 50 | + match fs::read_to_string(sample_path) { |
| 51 | + Ok(content) => match serde_json::from_str::<Value>(&content) { |
| 52 | + Ok(json) => { |
| 53 | + let flattened_json = flatten_json_body(json, None, None, None, false).unwrap(); |
| 54 | + let sample_json_records = [flattened_json.clone()]; |
| 55 | + let mut schema = |
| 56 | + infer_json_schema_from_iterator(sample_json_records.iter().map(Ok)) |
| 57 | + .unwrap(); |
| 58 | + schema = update_data_type_to_datetime(schema, flattened_json, Vec::new()); |
| 59 | + known_schema_list.insert(schema_type.to_string(), schema); |
| 60 | + } |
| 61 | + Err(err) => eprintln!("Invalid JSON in {}: {}", sample_path.display(), err), |
| 62 | + }, |
| 63 | + Err(err) => eprintln!("Failed to read {}: {}", sample_path.display(), err), |
| 64 | + } |
48 | 65 | }
|
49 | 66 | prepare_known_schema_list(known_schema_list.clone());
|
50 | 67 | known_schema_list
|
|
0 commit comments