diff --git a/src/byrow/byrow.jl b/src/byrow/byrow.jl
index 86d51285..6254561a 100644
--- a/src/byrow/byrow.jl
+++ b/src/byrow/byrow.jl
@@ -6,6 +6,9 @@ nunique(::_DUMMY_STRUCT) = false
 stdze!(::_DUMMY_STRUCT) = false
 stdze(::_DUMMY_STRUCT) = false
 select(::_DUMMY_STRUCT) = false
+# One dummy method per name, so both `typeof(rescale)` and `typeof(rescale!)` exist for the `byrow` methods below.
+rescale(::_DUMMY_STRUCT) = false
+rescale!(::_DUMMY_STRUCT) = false
 
 byrow(ds::AbstractDataset, ::typeof(Base.sum), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, threads = nrow(ds) > Threads.nthreads()*10) = row_sum(ds, by, cols, threads = threads)
 byrow(ds::AbstractDataset, ::typeof(Base.sum), col::ColumnIndex; by = identity, threads = nrow(ds) > Threads.nthreads()*10) = byrow(ds, sum, [col]; by = by, threads = threads)
@@ -225,6 +228,12 @@ byrow(ds::AbstractDataset, ::typeof(stdze), cols::MultiColumnIndex = names(ds, U
 
 byrow(ds::AbstractDataset, ::typeof(stdze!), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); threads = true) = row_stdze!(ds, cols, threads = threads)
 
+# Row-wise rescaling of `cols`: forwards `range` and `threads` to `row_rescale`.
+byrow(ds::AbstractDataset, ::typeof(rescale), cols::MultiColumnIndex=names(ds, Union{Missing,Number}); range=[0, 1], threads=true) = row_rescale(ds, cols, range=range, threads=threads)
+
+# Mutating counterpart: forwards `range` and `threads` to `row_rescale!`.
+byrow(ds::AbstractDataset, ::typeof(rescale!), cols::MultiColumnIndex=names(ds, Union{Missing,Number}); range=[0, 1], threads=true) = row_rescale!(ds, cols, range=range, threads=threads)
+
 function byrow(ds::AbstractDataset, ::typeof(hash), cols::MultiColumnIndex = :; by = identity, mapformats = false, threads = nrow(ds) > Threads.nthreads()*10)
     colsidx = multiple_getindex(index(ds), cols)
     if mapformats
diff --git a/src/byrow/doc.jl b/src/byrow/doc.jl
index 066d33b7..b82f9f54 100644
--- a/src/byrow/doc.jl
+++ b/src/byrow/doc.jl
@@ -78,6 +78,10 @@ function Docs.getdoc(x::typeof(byrow), y)
         return _get_doc_byrow("stdze!")
     elseif y == Tuple{typeof(stdze)}
         return _get_doc_byrow("stdze")
+    elseif y == Tuple{typeof(rescale!)}
+        return _get_doc_byrow("rescale!")
+    elseif y == Tuple{typeof(rescale)}
+        return 
_get_doc_byrow("rescale")
     else
         return _get_doc_byrow("generic")
     end
@@ -145,6 +149,9 @@ Perform a row-wise operation specified by `fun` on selected columns `cols`. Gene
 - `sort!`
 - `stdze`
 - `stdze!`
+- `rescale`
+- `rescale!`
+
 
 @@@@sum@@@@
     byrow(ds::AbstractDataset, sum, cols = names(ds, Number); [by = identity, threads])
@@ -1287,6 +1294,28 @@ julia> byrow(ds,stdze!,:)
     byrow(ds::AbstractDataset, stdze, cols; [threads])
 
 Variant of `byrow(stdze!)` which pass a copy of `ds` and leave `ds` untouched.
+
+@@@@rescale!@@@@
+    byrow(ds::Dataset, rescale!, cols; [range = [0, 1], threads])
+
+Replace each value in each row of `ds` for selected `cols` by its rescaled value.
+Rescaling, also known as min-max scaling or min-max normalization, is the simplest normalization method and consists of linearly mapping the values onto a target range.
+The formula to rescale `x` to an arbitrary range `[a, b]` is: `a + (x - min(x)) * (b - a) / (max(x) - min(x))`.
+
+Missing values are skipped in the calculation. When all values in a row are missing, the result is `missing`.
+If the maximum value of a row is equal to its minimum value, the result will also be `missing`.
+
+
+Pass `range = [minval, maxval]` to define the range of the rescaled result.
+Passing `threads = false` disables multithreaded computations.
+
+See [`byrow(rescale)`](@ref)
+
+@@@@rescale@@@@
+    byrow(ds::AbstractDataset, rescale, cols; [range = [0, 1], threads])
+
+Variant of `byrow(rescale!)` which operates on a copy of `ds` and leaves `ds` untouched.
+
 
 @@@@generic@@@@
     byrow(ds::AbstractDataset, fun, cols; [threads])
diff --git a/src/byrow/row_functions.jl b/src/byrow/row_functions.jl
index f7f4058e..d0095c95 100644
--- a/src/byrow/row_functions.jl
+++ b/src/byrow/row_functions.jl
@@ -1002,6 +1002,44 @@ function row_stdze(ds::AbstractDataset , cols = names(ds, Union{Missing, Number}
     dscopy
 end
 
+# Rescale (min-max normalise), in place, the values of `cols` within each row of `ds`
+# so that they span `range[1]` to `range[2]`:
+#     range[1] + (x - rowmin) * (range[2] - range[1]) / (rowmax - rowmin)
+# Rows whose maximum equals their minimum yield `missing`. Returns the modified `ds`.
+function row_rescale!(ds::Dataset, cols=names(ds, Union{Missing,Number}); range=[0, 1], threads=true)
+    colsidx = index(ds)[cols]
+
+    # Row-wise extrema over the selected columns; `threads` is only forwarded to these two calls.
+    mindata = row_minimum(ds, colsidx; threads=threads)
+    maxdata = row_maximum(ds, colsidx; threads=threads)
+    max_min = maxdata .- mindata
+
+    # Loop-invariant pieces, computed once instead of once per column.
+    lo = range[1]
+    span = range[2] - range[1]
+    zero_span = isequal.(max_min, 0)
+
+    for j in colsidx
+        x = _columns(ds)[j]
+        _columns(ds)[j] = ifelse.(zero_span, missing, lo .+ (((x .- mindata) .* span) ./ max_min))
+    end
+
+    # The stored values changed: remove formats on these columns and, if any of them is
+    # among the sorted columns, reset the grouping information.
+    removeformat!(ds, colsidx)
+    any(index(ds).sortedcols .∈ Ref(colsidx)) && _reset_grouping_info!(ds)
+    _modified(_attributes(ds))
+    return ds
+end
+
+# Non-mutating variant of `row_rescale!`: rescales a copy of `ds` and returns the copy.
+function row_rescale(ds::AbstractDataset, cols=names(ds, Union{Missing,Number}); range=[0, 1], threads=true)
+    dscopy = copy(ds)
+    row_rescale!(dscopy, cols; range=range, threads=threads)
+    return dscopy
+end
+
+
 function row_sort!(ds::Dataset, cols = names(ds, Union{Missing, Number}); kwargs...)
     colsidx = index(ds)[cols]
     T = mapreduce(eltype, promote_type, eachcol(ds)[colsidx])