-
Notifications
You must be signed in to change notification settings - Fork 6
1. Getting the Data Ready
An essential step to run your models in DashAI is to get data and define classes to deal with this form of data. It ranges from as simple as opening a csv file for text models to getting image data from folders for vision models.
Here, we need to specify some additional parameters about the data, like what batch size (bs) to use, validation set batch size (val_bs), device and number of worker threads to create.
Then in validation we can describe how to create the validation set (method) and then give extra parameters in the selected method section.
In label we have to specify how to label the data.
More information about the parameters can be found here.
To use custom functions, pass the location of a text file containing only the Python definition of that function with the relevant name: valid_func if the function is being used for validation-splitting, and label_func if it is for labeling.
"data": {
"bs": 2,
"val_bs": null,
"device": null,
"no_check": false,
"num_workers": 16,
"validation": {
"method": "from_df",
"by_rand_pct": {
"valid_pct": 0.2,
"seed": null
},
"idx": {
"csv_name": null,
"valid_idx": 20
},
"subsets": {
"train_size": 0.08,
"valid_size": 0.2,
"seed": null
},
"files": {
"valid_names": null
},
"fname_file": {
"fname": null,
"path": null
},
"folder": {
"train": "train",
"valid": "train"
},
"idxs": {
"train_idx": null,
"valid_idx": null
},
"list": {
"train": null,
"valid": null
},
"by_valid_func": {
"location": ""
},
"from_df": {
"col": 2
}
},
"label": {
"method": "from_df",
"from_df": {
"cols": 0,
"label_cls": null,
"label_delim": null,
"classes": null
},
"const": {
"const": 0,
"label_cls": null
},
"from_func": {
"location": ""
},
"from_re": {
"pat": null,
"full_path": false
},
"from_folder": {
"label_cls": null
}
}
}
Metrics for training fastai models are simply functions that take input and target tensors, and return some metric of interest for training. You may choose from different metric methods like accuracy, accuracy_thresh, top_k_accuracy, dice, fbeta, Precision, Recall, FBeta, KappaScore and MultiLabelFbeta.
"metric": {
"methods": [
"accuracy"
],
"accuracy_thresh": {
"thresh": 0.5,
"sigmoid": true
},
"top_k_accuracy": {
"k": 5
},
"dice": {
"iou": false,
"eps": 1e-8
},
"fbeta": {
"thresh": 0.2,
"beta": 2,
"eps": 1e-9,
"sigmoid": true
},
"Precision": {
"average": "binary",
"pos_label": 1,
"eps": 1e-9
},
"Recall": {
"average": "binary",
"pos_label": 1,
"eps": 1e-9
},
"FBeta": {
"average": "binary",
"pos_label": 1,
"eps": 1e-9,
"beta": 2
},
"KappaScore": {
"weights": null
},
"MultiLabelFbeta": {
"beta": 2,
"eps": 1e-15,
"thresh": 0.3,
"sigmoid": true,
"average": "micro"
}
}
DashAI allows usage of predefined loss functions like BCEFlat, BCEWithLogitsFlat, CrossEntropyFlat and MSELossFlat.
"loss": {
"type": "pre-defined",
"pre-defined": {
"func": "MSELossFlat"
},
"custom": {
"fname": null,
"func": null
}
}
Helps use various optimizer functions provided by PyTorch to optimize your models. You may select your preferred optimizers from the set of methods provided in available_opts.
"optimizer": {
"available_opts": [
"SGD",
"RMSProp",
"Adam",
"AdamW",
"Adadelta",
"Adagrad",
"SparseAdam",
"Adamax",
"ASGD"
],
"chosen_opt": "AdamW",
"arguments": {
"SGD": {
"lr": 0,
"momentum": 0,
"weight_decay": 0,
"dampening": 0,
"nesterov": false
},
"RMSProp": {
"lr": 0.01,
"momentum": 0,
"alpha": 0.99,
"eps": 1e-8,
"centered": false,
"weight_decay": 0
},
"Adam": {
"lr": 0.001,
"momentum": 0.9,
"alpha": 0.999,
"eps": 1e-8,
"weight_decay": 0,
"amsgrad": false
},
"AdamW": {
"lr": 0.001,
"momentum": 0.9,
"alpha": 0.999,
"eps": 1e-8,
"weight_decay": 0.01,
"amsgrad": false
},
"Adadelta": {
"lr": 1,
"rho": 0.9,
"eps": 0.000001,
"weight_decay": 0
},
"Adagrad": {
"lr": 0.01,
"lr_decay": 0,
"eps": 1e-10,
"weight_decay": 0
},
"SparseAdam": {
"lr": 0.001,
"momentum": 0.9,
"alpha": 0.999,
"eps": 1e-8
},
"Adamax": {
"lr": 0.002,
"momentum": 0.9,
"alpha": 0.999,
"eps": 1e-8,
"weight_decay": 0.01
},
"ASGD": {
"lr": 0.01,
"lambd": 0.0001,
"alpha": 0.75,
"t0": 1000000,
"weight_decay": 0
}
}
}
Tabular Data usually comes in the form of a delimiter file (such as .csv). During this process, we require you to mention the path of the csv file in (csv_name). The dependent variable needs to be mentioned in (dep_var). The columns which are categorical and continuous are given in (cat_names) and (cont_names). Finally you can mention the path to the csv file under test_df which can behave as your test dataset if has_test is True.
"input": {
"csv_name": "./data/adult_sample/adult.csv",
"dep_var": "salary",
"cat_names": [
"workclass", "education", "marital-status", "occupation",
"relationship", "race"
],
"cont_names": [
"age", "fnlwgt", "education-num"
],
"test_df": {
"has_test": false,
"csv_name": null
}
}
The above example uses a sample of the adult dataset which has some census information on individuals. We'll use it to train a model to predict whether salary is greater than $50k or not.
"transform": {
"FillMissing": {
"fill_strategy": "MEDIAN",
"add_col": true,
"fill_val": 0
},
"Categorify": true,
"Normalize": true,
"Datetime": {
"cols": [],
"cyclic": false
}
},
Here we require you to define datasets handling Image objects and their transformations. Choose subtask as per your application.
-
classification-single-label
-> -
classification-multi-label
-> -
regression
-> -
segmentation
->
We accept two methods to provide data for the model.
(from_folder) -> Imagenet style of datasets (Preferred Method) -> Use the mnist_tiny folder structure for providing your images.
(from_csv) -> A csv file with a column of filenames and a column of labels which can be strings for classification, strings separated by a label_delim for multi-classification, or floats for a regression problem.
"subtask": "classification-single-label",
"input": {
"method": "from_folder",
"from_folder": {
"path": "data/mnist_tiny",
"extensions": null,
"recurse": true,
"exclude": null,
"include": null,
"processor": null,
"presort": false
},
"from_csv": {
"csv_name": null,
"path":null,
"cols": 0,
"delimiter": null,
"header": "infer",
"processor": null
}
},
"classification-single-label": {},
"classification-multi-label": {},
"regression": {},
"segmentation": {
"path_lbl":"data/camvid_tiny/labels",
"codes":"data/camvid_tiny/codes.txt"
},
"gan": {
"noise_sz": 100
},
"object-detection": {}
For this example, we provide the data folder containing a MNIST subset. It will grab the data into train and validation sets from subfolders of classes.
Vision Transformations contain all the transforms we can use for data augmentation.
"transform":
{
"size":24,
"data_aug":["basic_transforms","zoom_crop","manual"],
"manual_augs":["brightness","contrast","crop","crop_pad","dihedral","dihedral_affine","flip_lr","flip_affine","jitter","pad","rotate","rgb_randomize","skew","squish","symmetric_wrap","tilt","zoom","cutout"],
"chosen_aug_train":"manual",
"chosen_aug_valid":"manual",
"manual_train":["brightness","contrast"],
"manual_valid":["crop","cutout"],
"train":
{
"basic_transforms":
{
"do_flip":true,
"flip_vert":false,
"max_rotate":10.0,
"max_zoom":1,
"max_lighting":0.8,
"max_warp":0.2,
"p_affine":0.75,
"p_lighting":0.75
},
"zoom_crop":
{
"scale":[0.75,2],
"do_rand":true,
"p":1.0
},
"manual":
{
"brightness":
{
"change":0.5
},
"contrast":
{
"l_scale":1.0,
"h_scale":1.0
},
"crop":
{
"size":300,
"row_pct":0.5,
"col_pct":0.5
},
"crop_pad":
{
"size":300,
"padding_mode":"reflection",
"row_pct":0.5,
"col_pct":0.5
},
"dihedral":
{
"k":0
},
"dihedral_affine":
{
"k":0
},
"flip_lr":
{},
"flip_affine": {},
"jitter":
{
"magnitude":0.0
},
"pad":
{
"padding":1,
"available_modes":["zeros", "border", "reflection"],
"mode":"reflection"
},
"rotate":
{
"degrees":0.0
},
"rgb_randomize":
{
"channels":["Red", "Green", "Blue"],
"chosen_channels":"Red",
"thresh":[0.2,0.99,3],
"chosen_thresh":3
},
"skew":
{
"l_direction":0,
"h_direction":0,
"magnitude":0,
"invert":false
},
"squish":
{
"scale":1.0,
"row_pct":0.5,
"col_pct":0.5
},
"symmetric_wrap":
{
"magnitude":[-0.2,0.2]
},
"tilt":
{
"l_direction":0,
"h_direction":0,
"magnitude":0
},
"zoom":
{
"scale":1.0,
"row_pct":0.5,
"col_pct":0.5
},
"cutout":
{
"l_n_holes":1,
"h_n_holes":1,
"l_length":20,
"h_length":20
}
}
}
}
"input": {
"method": "from_csv",
"from_csv": {
"path": "data/imdb_sample",
"csv_name": "texts.csv",
"cols": 1,
"vocab": null
},
"from_folder": {
"path": null
}
}
Here we require the user to provide the data in the form of a csv (csv_name). Define the (user_name), (item_name) and (rating) column names from the csv. You could include a test dataset in test_df if necessary.
"input": {
"csv_name": "./data/movie_lens_sample/ratings.csv",
"user_name": "userId",
"item_name": "movieId",
"rating": ["rating"],
"test_df": {
"has_test": false,
"csv_name": null
}
}
For this example, we'll use a small subset of the MovieLens dataset to predict the rating a user would give a particular movie (from 0 to 5). The dataset comes in the form of a csv file where each line is a rating of a movie by a given person.
Collab Transforms have similar json structure as the Tabular Transforms