diff --git a/Project.toml b/Project.toml
index 5303a45..63140f6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "FeatureSelection"
 uuid = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6"
 authors = ["Anthony D. Blaom ", "Samuel Okon

 transform(fit!(machine(selector, X)), X)
 ```
 """
-FeatureSelector
\ No newline at end of file
+FeatureSelector
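The fragment above is truncated: the remainder of the `Project.toml` hunk and the header of the file diff containing the `FeatureSelector` docstring were lost, leaving only the docstring's closing one-liner and the trailing-newline fix. For orientation, here is a minimal sketch of that same round trip, assuming `FeatureSelector` is in scope via `using FeatureSelection`, with `MLJBase` supplying `machine`, `fit!` and `transform`; the table `X` and the chosen `features` are illustrative only:

```julia
using MLJBase, FeatureSelection

# a toy three-column table
X = (ordinal = [1, 2, 3], nominal = ["a", "b", "c"], extra = [0.1, 0.2, 0.3])

# keep only the named columns
selector = FeatureSelector(features = [:ordinal, :nominal])

# fit and transform in one expression, as in the docstring example
transform(fit!(machine(selector, X)), X)  # table with :ordinal and :nominal only
```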
diff --git a/src/models/rfe.jl b/src/models/rfe.jl
index db636b9..a2e9f21 100644
--- a/src/models/rfe.jl
+++ b/src/models/rfe.jl
@@ -1,7 +1,7 @@
 function warn_double_spec(arg, model)
     return "Using `model=$arg`. Ignoring keyword specification `model=$model`. "
 end
-    
+
 const ERR_SPECIFY_MODEL = ArgumentError(
     "You need to specify model as positional argument or specify `model=...`."
 )
@@ -36,66 +36,67 @@ for (ModelType, ModelSuperType) in MODELTYPE_GIVEN_SUPERTYPES
     eval(ex)
 end
 
-eval(:(const RFE{M} = Union{$((Expr(:curly, modeltype, :M) for modeltype in MODEL_TYPES)...)}))
+eval(:(const RFE{M} =
+    Union{$((Expr(:curly, modeltype, :M) for modeltype in MODEL_TYPES)...)}))
 
 # Common keyword constructor for both model types
 """
     RecursiveFeatureElimination(model, n_features, step)
 
-This model implements a recursive feature elimination algorithm for feature selection. 
-It recursively removes features, training a base model on the remaining features and 
+This model implements a recursive feature elimination algorithm for feature selection.
+It recursively removes features, training a base model on the remaining features and
 evaluating their importance until the desired number of features is selected.
 
-Construct an instance with default hyper-parameters using the syntax 
-`model = RecursiveFeatureElimination(model=...)`. Provide keyword arguments to override 
-hyper-parameter defaults. 
-
+Construct an instance with default hyper-parameters using the syntax
+`rfe_model = RecursiveFeatureElimination(model=...)`. Provide keyword arguments to override
+hyper-parameter defaults.
+
 # Training data
 
-In MLJ or MLJBase, bind an instance `model` to data with
+In MLJ or MLJBase, bind an instance `rfe_model` to data with
 
-    mach = machine(model, X, y)
+    mach = machine(rfe_model, X, y)
 
 OR, if the base model supports weights, as
 
-    mach = machine(model, X, y, w)
+    mach = machine(rfe_model, X, y, w)
 
 Here:
 
 - `X` is any table of input features (eg, a `DataFrame`) whose columns are of the scitype
-  as that required by the base model; check column scitypes with `schema(X)` and column 
+  as that required by the base model; check column scitypes with `schema(X)` and column
   scitypes required by base model with `input_scitype(basemodel)`.
-- `y` is the target, which can be any table of responses whose element scitype is 
-  `Continuous` or `Finite` depending on the `target_scitype` required by the base model; 
+- `y` is the target, which can be any table of responses whose element scitype is
+  `Continuous` or `Finite` depending on the `target_scitype` required by the base model;
   check the scitype with `scitype(y)`.
-- `w` is the observation weights which can either be `nothing`(default) or an 
-  `AbstractVector` whoose element scitype is `Count` or `Continuous`. This is different 
+- `w` is the observation weights, which can be either `nothing` (default) or an
+  `AbstractVector` whose element scitype is `Count` or `Continuous`. This is different
   from `weights` kernel which is an hyperparameter to the model, see below.
 
 Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-- model: A base model with a `fit` method that provides information on feature 
+- model: A base model with a `fit` method that provides information on
   feature importance (i.e `reports_feature_importances(model) == true`)
-- n_features::Real = 0: The number of features to select. If `0`, half of the 
-  features are selected. If a positive integer, the parameter is the absolute number 
-  of features to select. If a real number between 0 and 1, it is the fraction of features 
+- n_features::Real = 0: The number of features to select. If `0`, half of the
+  features are selected. If a positive integer, the parameter is the absolute number
+  of features to select. If a real number between 0 and 1, it is the fraction of features
   to select.
-- step::Real=1: If the value of step is at least 1, it signifies the quantity of features to 
-  eliminate in each iteration. Conversely, if step falls strictly within the range of 
+- step::Real=1: If the value of step is at least 1, it signifies the quantity of features to
+  eliminate in each iteration. Conversely, if step falls strictly within the range of
   0.0 to 1.0, it denotes the proportion (rounded down) of features to remove during each
   iteration.
 
 # Operations
 
-- `transform(mach, X)`: transform the input table `X` into a new table containing only 
+- `transform(mach, X)`: transform the input table `X` into a new table containing only
   columns corresponding to features selected by the RFE algorithm.
-- `predict(mach, X)`: transform the input table `X` into a new table same as in 
-  `transform(mach, X)` above and predict using the fitted base model on the 
+- `predict(mach, X)`: transform the input table `X` into a new table, as in
+  `transform(mach, X)` above, and predict using the fitted base model on the
   transformed table.
 
 # Fitted parameters
@@ -106,11 +107,11 @@ The fields of `fitted_params(mach)` are:
 
 # Report
 
 The fields of `report(mach)` are:
-- `ranking`: The feature ranking of each features in the training dataset. 
+- `ranking`: The feature ranking of each feature in the training dataset.
 
 - `model_report`: report for the fitted base model.
 
-- `features`: names of features seen during the training process. 
+- `features`: names of features seen during the training process.
 
 # Examples
 
 ```
@@ -131,10 +132,10 @@ selector = RecursiveFeatureElimination(model = rf)
 mach = machine(selector, X, y)
 fit!(mach)
 
-# view the feature importances 
+# view the feature importances
 feature_importances(mach)
 
-# predict using the base model 
+# predict using the base model
 Xnew = MLJ.table(rand(rng, 50, 10));
 predict(mach, Xnew)
@@ -160,7 +161,7 @@ function RecursiveFeatureElimination(
     #TODO: Check that the specifed model implements the predict method.
     # probably add a trait to check this
     MMI.reports_feature_importances(model) || throw(ERR_FEATURE_IMPORTANCE_SUPPORT)
-    if model isa Deterministic 
+    if model isa Deterministic
         selector = DeterministicRecursiveFeatureElimination{typeof(model)}(
             model, Float64(n_features), Float64(step)
         )
@@ -170,7 +171,7 @@ function RecursiveFeatureElimination(
         )
     else
         throw(ERR_MODEL_TYPE)
-    end  
+    end
     message = MMI.clean!(selector)
     isempty(message) || @warn(message)
     return selector
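The docstring hunks above pin down the `n_features` and `step` conventions. A hedged sketch of what those settings mean in practice, assuming `FeatureSelection` is loaded alongside MLJ; `RandomForestRegressor` is just one example of a base model that reports feature importances:

```julia
using MLJ, FeatureSelection
RandomForestRegressor = @load RandomForestRegressor pkg=DecisionTree

rf = RandomForestRegressor()

# n_features = 0 (the default): keep half of the features
rfe_default = RecursiveFeatureElimination(model = rf)

# keep a quarter of the features, eliminating two per iteration
rfe_frac = RecursiveFeatureElimination(model = rf, n_features = 0.25, step = 2)

# keep exactly three features, dropping 30% (rounded down) per iteration
rfe_abs = RecursiveFeatureElimination(model = rf, n_features = 3, step = 0.3)
```

The resolution of these `Real`-valued hyper-parameters into integer counts is exactly what the `MMI.fit` hunks below implement.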
@@ -204,7 +205,7 @@ function MMI.fit(selector::RFE, verbosity::Int, X, y, args...)
     n_features_select = selector.n_features
     ## zero indicates that half of the features be selected.
     if n_features_select == 0
-        n_features_select = div(nfeatures, 2) 
+        n_features_select = div(nfeatures, 2)
     elseif 0 < n_features_select < 1
         n_features_select = round(Int, n_features_select * nfeatures)
     else
@@ -212,13 +213,13 @@ function MMI.fit(selector::RFE, verbosity::Int, X, y, args...)
     end
 
     step = selector.step
-    
+
     if 0 < step < 1
         step = round(Int, max(1, step * n_features_select))
     else
-        step = round(Int, step) 
+        step = round(Int, step)
     end
-    
+
     support = trues(nfeatures)
     ranking = ones(Int, nfeatures) # every feature has equal rank initially
     mask = trues(nfeatures) # for boolean indexing of ranking vector in while loop below
@@ -230,7 +231,7 @@ function MMI.fit(selector::RFE, verbosity::Int, X, y, args...)
         # Rank the remaining features
         model = selector.model
         verbosity > 0 && @info("Fitting estimator with $(n_features_left) features.")
-        
+
         data = MMI.reformat(model, MMI.selectcols(X, features_left), args...)
         fitresult, _, report = MMI.fit(model, verbosity - 1, data...)
@@ -263,14 +264,14 @@
     data = MMI.reformat(selector.model, MMI.selectcols(X, features_left), args...)
     verbosity > 0 && @info ("Fitting estimator with $(n_features_left) features.")
     model_fitresult, _, model_report = MMI.fit(selector.model, verbosity - 1, data...)
-    
+
     fitresult = (
         support = support,
         model_fitresult = model_fitresult,
         features_left = features_left,
         features = features
     )
-    report = ( 
+    report = (
         ranking = ranking,
         model_report = model_report
     )
@@ -294,7 +295,7 @@ end
 
 function MMI.transform(::RFE, fitresult, X)
     sch = Tables.schema(Tables.columns(X))
-    if (length(fitresult.features) == length(sch.names) && 
+    if (length(fitresult.features) == length(sch.names) &&
         !all(e -> e in sch.names, fitresult.features))
         throw(
             ERR_FEATURES_SEEN
@@ -312,7 +313,7 @@ function MMI.save(model::RFE, fitresult)
     atomic_fitresult = fitresult.model_fitresult
     features_left = fitresult.features_left
     features = fitresult.features
-    
+
     atom = model.model
     return (
         support = copy(support),
@@ -337,14 +338,12 @@ function MMI.restore(model::RFE, serializable_fitresult)
     )
 end
 
-## Traits definitions
-function MMI.load_path(::Type{<:DeterministicRecursiveFeatureElimination})
-    return "FeatureSelection.DeterministicRecursiveFeatureElimination"
-end
+## Trait definitions
 
-function MMI.load_path(::Type{<:ProbabilisticRecursiveFeatureElimination})
-    return "FeatureSelection.ProbabilisticRecursiveFeatureElimination"
-end
+# load path points to constructor not type:
+MMI.load_path(::Type{<:RFE}) = "FeatureSelection.RecursiveFeatureElimination"
+MMI.constructor(::Type{<:RFE}) = RecursiveFeatureElimination
+MMI.package_name(::Type{<:RFE}) = "FeatureSelection"
 
 for trait in [
     :supports_weights,
@@ -387,4 +386,17 @@ end
 
 ## TRAINING LOSSES SUPPORT
 function MMI.training_losses(model::RFE, rfe_report)
     return MMI.training_losses(model.model, rfe_report.model_report)
-end
\ No newline at end of file
+end
+
+## Pkg Traits
+MMI.metadata_pkg.(
+    (
+        DeterministicRecursiveFeatureElimination,
+        ProbabilisticRecursiveFeatureElimination,
+    ),
+    package_name = "FeatureSelection",
+    package_uuid = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6",
+    package_url = "https://github.com/JuliaAI/FeatureSelection.jl",
+    is_pure_julia = true,
+    package_license = "MIT"
+)
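The trait hunk above collapses the two per-type `load_path` methods into a single definition on the `RFE` union and, as the in-diff comment notes, points the load path at the constructor rather than a type. A quick check of how those traits read back on an instance, mirroring the new assertions in the test diff below; the `RandomForestRegressor` base model is again only an example:

```julia
using MLJBase, FeatureSelection, MLJDecisionTreeInterface

rfe = RecursiveFeatureElimination(
    model = MLJDecisionTreeInterface.RandomForestRegressor()
)

MLJBase.load_path(rfe)     # "FeatureSelection.RecursiveFeatureElimination"
MLJBase.constructor(rfe)   # RecursiveFeatureElimination
MLJBase.package_name(rfe)  # "FeatureSelection"
```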
diff --git a/test/models/rfe.jl b/test/models/rfe.jl
index 3910b36..d936e4f 100644
--- a/test/models/rfe.jl
+++ b/test/models/rfe.jl
@@ -17,12 +17,15 @@ const DTM = DummyTestModels
     @test_throws FeatureSelection.ERR_SPECIFY_MODEL RecursiveFeatureElimination()
     reg = DTM.DeterministicConstantRegressor()
     @test_throws(
-        FeatureSelection.ERR_FEATURE_IMPORTANCE_SUPPORT, 
+        FeatureSelection.ERR_FEATURE_IMPORTANCE_SUPPORT,
        RecursiveFeatureElimination(model = DTM.DeterministicConstantRegressor())
     )
     rf = MLJDecisionTreeInterface.RandomForestRegressor(rng = rng)
     selector = RecursiveFeatureElimination(model = rf)
     @test selector isa FeatureSelection.DeterministicRecursiveFeatureElimination
+    @test MLJBase.constructor(selector) == RecursiveFeatureElimination
+    @test MLJBase.package_name(selector) == "FeatureSelection"
+    @test MLJBase.load_path(selector) == "FeatureSelection.RecursiveFeatureElimination"
 
     # Fit
     selector_mach = machine(selector, Xt, y)
@@ -34,7 +37,7 @@ const DTM = DummyTestModels
         selector_mach.model.model, selector_mach.fitresult.model_fitresult
     )
     @test feature_importances(selector_mach) == [
-        :x1 => 6.0, :x2 => 5.0, :x3 => 4.0, :x4 => 3.0, :x5 => 2.0, 
+        :x1 => 6.0, :x2 => 5.0, :x3 => 4.0, :x4 => 3.0, :x5 => 2.0,
         :x6 => 1.0, :x7 => 1.0, :x8 => 1.0, :x9 => 1.0, :x10 => 1.0
     ]
     rpt = report(selector_mach)
@@ -94,7 +97,7 @@ end
         measure = rms,
         tuning = Grid(rng=rng),
         resampling = StratifiedCV(nfolds = 5),
-        range = range(rfecv, :n_features, values = 1:10) 
+        range = range(rfecv, :n_features, values = 1:10)
     )
     self_tuning_rfe_mach = machine(tuning_rfe_model, Xs, ys)
     fit!(self_tuning_rfe_mach)
@@ -127,4 +130,4 @@ end
     mach2 = MLJBase.machine(io)
     close(io)
     @test MLJBase.predict(mach2, (; x1=rand(2), x2 = rand(2))) == yhat
-end
\ No newline at end of file
+end
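The final hunk exercises machine serialization end to end via the `MMI.save`/`MMI.restore` overloads added in `src/models/rfe.jl`. For reference, a minimal sketch of the same round trip outside the test suite, assuming `mach` is a fitted RFE machine and `Xnew` is a table with the training schema; the `seekstart` call is an assumption about what the elided test setup does before reading the buffer back:

```julia
io = IOBuffer()
MLJBase.save(io, mach)       # serialize, via the `MMI.save` overload above
seekstart(io)                # rewind before restoring
mach2 = MLJBase.machine(io)  # restore, via the `MMI.restore` overload above
close(io)
MLJBase.predict(mach2, Xnew) # predictions should match those of `mach`
```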