using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
using Plots
begin
# Load the Kaggle housing data; "NA" strings become `missing`.
train_data = CSV.read("data//home-data-for-ml-course//train.csv", missingstring="NA", DataFrame)
# BUG FIX: previously this re-read train.csv, so test_data duplicated the
# training set; it must read the held-out test file instead.
test_data = CSV.read("data//home-data-for-ml-course//test.csv", missingstring="NA", DataFrame)
end;
# Split the target column :SalePrice into `y`; everything else becomes the features.
y, X_train_data = unpack(train_data, name -> name == :SalePrice);
# Keep only the columns whose element type is numeric (allowing missings).
numeric_cols = filter(propertynames(X_train_data)) do colname
    eltype(X_train_data[:, colname]) <: Union{Number,Missing}
end
37-element Vector{Symbol}:
:Id
:MSSubClass
:LotFrontage
:LotArea
:OverallQual
:OverallCond
:YearBuilt
⋮
Symbol("3SsnPorch")
:ScreenPorch
:PoolArea
:MiscVal
:MoSold
:YrSold
# Restrict both train and test features to the numeric columns selected above.
X = X_train_data[:, numeric_cols];
X_test = test_data[:, numeric_cols];
# Imputer with default fills: median for continuous, rounded median for counts,
# mode for finite (categorical) features.
preprocessor = FillImputer()
FillImputer( features = Symbol[], continuous_fill = MLJModels._median, count_fill = MLJModels._round_median, finite_fill = MLJModels._mode)
# Load the RandomForestRegressor model type from the DecisionTree.jl package.
Forest = @load RandomForestRegressor pkg=DecisionTree
RandomForestRegressor
# 50-tree forest; fixed rng seed (0) for reproducible fits.
model = Forest(n_trees=50, rng=0)
RandomForestRegressor( max_depth = -1, min_samples_leaf = 1, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 50, sampling_fraction = 0.7, feature_importance = :impurity, rng = 0)
# Chain imputation and the forest into a single deterministic pipeline.
my_pipeline = preprocessor |> model
DeterministicPipeline(
fill_imputer = FillImputer(
features = Symbol[],
continuous_fill = MLJModels._median,
count_fill = MLJModels._round_median,
finite_fill = MLJModels._mode),
random_forest_regressor = RandomForestRegressor(
max_depth = -1,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = -1,
n_trees = 50,
sampling_fraction = 0.7,
feature_importance = :impurity,
rng = 0),
cache = true)
# 5-fold cross-validation resampling strategy (no shuffling).
cv = CV(; nfolds=5)
CV( nfolds = 5, shuffle = false, rng = Random._GLOBAL_RNG())
# Cross-validated mean absolute error of the pipeline on the training features.
scores = evaluate(my_pipeline, X, y, resampling=cv, measure=mae, verbosity=0)
PerformanceEvaluation object with these fields:
measure, operation, measurement, per_fold,
per_observation, fitted_params_per_fold,
report_per_fold, train_test_rows
Extract:
┌─────────────────────┬───────────┬─────────────┬─────────┬─────────────────────────────
│ measure │ operation │ measurement │ 1.96*SE │ per_fold ⋯
├─────────────────────┼───────────┼─────────────┼─────────┼─────────────────────────────
│ MeanAbsoluteError() │ predict │ 18000.0 │ 1010.0 │ [17600.0, 18800.0, 18300.0 ⋯
└─────────────────────┴───────────┴─────────────┴─────────┴─────────────────────────────
1 column omitted
# Report the CV-averaged MAE (first entry of the measurement vector).
println("Average MAE score: ", first(scores.measurement))
# Build an imputer + random-forest pipeline with `n_trees` trees and return
# its 3-fold cross-validated mean absolute error on the global `X`, `y`.
function get_score(n_trees)
    imputer = FillImputer()
    forest = RandomForestRegressor(n_trees=n_trees, rng=0)
    pipeline = imputer |> forest
    result = evaluate(pipeline, X, y, resampling=CV(nfolds=3), measure=mae, verbosity=0)
    return first(result.measurement)
end
get_score (generic function with 1 method)
# Accumulator of (MAE score => n_trees) pairs. Concretely typed rather than
# `[]` (a Vector{Any}), which defeats specialization on later operations.
results = Pair{Float64, Int}[]
Any[]
# Score forests of 50, 100, …, 450 trees, recording score => n_trees pairs.
for n_trees in 50:50:450
    push!(results, get_score(n_trees) => n_trees)
end
# Display the collected (score => n_trees) pairs.
results
9-element Vector{Any}:
18071.71025477222 => 50
17858.24558134262 => 100
17744.93062062645 => 150
17743.59114376393 => 200
17734.406606957855 => 250
17684.723639670297 => 300
17681.84223165018 => 350
17718.765966673294 => 400
17701.136647137413 => 450
# The MAE scores alone (first element of each pair).
first.(results)
9-element Vector{Float64}:
18071.71025477222
17858.24558134262
17744.93062062645
17743.59114376393
17734.406606957855
17684.723639670297
17681.84223165018
17718.765966673294
17701.136647137413
# The tree counts alone (last element of each pair).
last.(results)
9-element Vector{Int64}:
50
100
150
200
250
300
350
400
450
# MAE versus number of trees.
plot(last.(results), first.(results))
# Pairs compare by their first element (the score) first, so `minimum`
# returns the lowest-MAE pair.
minimum(results)
17681.84223165018 => 350
# Tree count belonging to the lowest cross-validated MAE.
n_trees_best = minimum(results).second
350
Built with Julia 1.9.1 and
CSV 0.10.9
DataFrames 1.5.0
MLJ 0.19.1
MLJDecisionTreeInterface 0.4.0
Plots 1.38.7
To run this tutorial locally, download [this file](/tutorials/crossvalidation02x05.jl) and open it with Pluto.jl.