using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
using Plots
begin
# Load the Kaggle "Housing Prices" course data; "NA" strings become `missing`.
train_data = CSV.read("data//home-data-for-ml-course//train.csv", missingstring="NA", DataFrame)
# BUG FIX: the held-out set must come from test.csv — it previously re-read train.csv.
test_data = CSV.read("data//home-data-for-ml-course//test.csv", missingstring="NA", DataFrame)
end;
# Split off the prediction target: `y` is the SalePrice column, `X_train_data` the rest.
y, X_train_data = unpack(train_data, ==(:SalePrice));
# Keep only numeric predictors (allowing `missing`) for this baseline model.
numeric_cols = [cname for cname in propertynames(X_train_data) if eltype(X_train_data[:,cname]) <: Union{Number,Missing}]
37-element Vector{Symbol}: :Id :MSSubClass :LotFrontage :LotArea :OverallQual :OverallCond :YearBuilt ⋮ Symbol("3SsnPorch") :ScreenPorch :PoolArea :MiscVal :MoSold :YrSold
# Restrict both the training and the test features to the shared numeric columns.
X = select(X_train_data, numeric_cols);
X_test = select(test_data, numeric_cols);
# Imputer for remaining missings (defaults: median for continuous, mode for finite).
preprocessor = FillImputer()
FillImputer( features = Symbol[], continuous_fill = MLJModels._median, count_fill = MLJModels._round_median, finite_fill = MLJModels._mode)
# Load the RandomForestRegressor model type from the DecisionTree.jl backend.
Forest = @load RandomForestRegressor pkg=DecisionTree
RandomForestRegressor
# A 50-tree forest; fixed rng seed makes the scores reproducible.
model = Forest(n_trees=50, rng=0)
RandomForestRegressor( max_depth = -1, min_samples_leaf = 1, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 50, sampling_fraction = 0.7, feature_importance = :impurity, rng = 0)
# Chain imputation into the model: each fold imputes first, then fits the forest.
my_pipeline = preprocessor |> model
DeterministicPipeline( fill_imputer = FillImputer( features = Symbol[], continuous_fill = MLJModels._median, count_fill = MLJModels._round_median, finite_fill = MLJModels._mode), random_forest_regressor = RandomForestRegressor( max_depth = -1, min_samples_leaf = 1, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 50, sampling_fraction = 0.7, feature_importance = :impurity, rng = 0), cache = true)
# 5-fold cross-validation resampling strategy (no shuffling by default).
cv=CV(nfolds=5)
CV( nfolds = 5, shuffle = false, rng = Random._GLOBAL_RNG())
# Cross-validated mean absolute error of the whole pipeline on the training data.
scores = evaluate(my_pipeline, X, y, resampling=cv, measure=mae, verbosity=0)
PerformanceEvaluation object with these fields: measure, operation, measurement, per_fold, per_observation, fitted_params_per_fold, report_per_fold, train_test_rows Extract: ┌─────────────────────┬───────────┬─────────────┬─────────┬───────────────────────────── │ measure │ operation │ measurement │ 1.96*SE │ per_fold ⋯ ├─────────────────────┼───────────┼─────────────┼─────────┼───────────────────────────── │ MeanAbsoluteError() │ predict │ 18000.0 │ 1010.0 │ [17600.0, 18800.0, 18300.0 ⋯ └─────────────────────┴───────────┴─────────────┴─────────┴───────────────────────────── 1 column omitted
# `scores.measurement` holds one aggregate value per measure; we used a single one (MAE).
println("Average MAE score: $(first(scores.measurement))")
"""
    get_score(n_trees; nfolds=3)

Return the cross-validated mean absolute error (MAE) of an
impute-then-random-forest pipeline built with `n_trees` trees,
using `nfolds`-fold cross-validation.

NOTE(review): this closes over the notebook-level globals `X` and `y`
defined above — fine in a notebook, but keep in mind if extracting it.
"""
function get_score(n_trees; nfolds=3)
	preprocessor = FillImputer()
	# Fixed rng seed so scores are comparable across different n_trees values.
	model = RandomForestRegressor(n_trees=n_trees, rng=0)
	my_pipeline = preprocessor |> model
	scores = evaluate(my_pipeline, X, y, resampling=CV(nfolds=nfolds), measure=mae, verbosity=0)
	return first(scores.measurement)
end
get_score (generic function with 1 method)
# Collect (score => n_trees) pairs; a concrete eltype avoids a slow Vector{Any}.
results = Pair{Float64,Int}[]
Any[]
# Score forests of 50, 100, …, 450 trees, recording each score => tree count.
for n_trees in 50:50:450
	push!(results, get_score(n_trees) => n_trees)
end
results
9-element Vector{Any}: 18071.71025477222 => 50 17858.24558134262 => 100 17744.93062062645 => 150 17743.59114376393 => 200 17734.406606957855 => 250 17684.723639670297 => 300 17681.84223165018 => 350 17718.765966673294 => 400 17701.136647137413 => 450
# Just the MAE scores (first element of each pair).
first.(results)
9-element Vector{Float64}: 18071.71025477222 17858.24558134262 17744.93062062645 17743.59114376393 17734.406606957855 17684.723639670297 17681.84223165018 17718.765966673294 17701.136647137413
# Just the tree counts (second element of each pair).
last.(results)
9-element Vector{Int64}: 50 100 150 200 250 300 350 400 450
# MAE as a function of forest size.
plot(last.(results), first.(results))
# Pairs compare lexicographically, so this yields the pair with the lowest MAE.
minimum(results)
17681.84223165018 => 350
# Best tree count = second element of the lowest-MAE pair.
n_trees_best = last(minimum(results))
350
Built with Julia 1.9.1 and
CSV 0.10.9
DataFrames 1.5.0
MLJ 0.19.1
MLJDecisionTreeInterface 0.4.0
Plots 1.38.7
To run this tutorial locally, download [this file](/tutorials/crossvalidation02x05.jl) and open it with Pluto.jl.