using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
using Plots
begin
# Load the Kaggle "Housing Prices" course data; "NA" strings become `missing`.
train_data = CSV.read("data//home-data-for-ml-course//train.csv", missingstring="NA", DataFrame)
# BUG FIX: the held-out set must come from test.csv — it previously re-read train.csv.
test_data = CSV.read("data//home-data-for-ml-course//test.csv", missingstring="NA", DataFrame)
end;
# Split off the prediction target: `y` is the SalePrice column, `X_train_data` the rest.
y, X_train_data = unpack(train_data, ==(:SalePrice));
# Keep only numeric predictors (allowing `missing`) for this baseline model.
numeric_cols = [cname for cname in propertynames(X_train_data) if eltype(X_train_data[:,cname]) <: Union{Number,Missing}]
37-element Vector{Symbol}: :Id :MSSubClass :LotFrontage :LotArea :OverallQual :OverallCond :YearBuilt ⋮ Symbol("3SsnPorch") :ScreenPorch :PoolArea :MiscVal :MoSold :YrSold
# Restrict both the training and the test features to the shared numeric columns.
X = select(X_train_data, numeric_cols);
X_test = select(test_data, numeric_cols);
# Imputer for remaining missings (defaults: median for continuous, mode for finite).
preprocessor = FillImputer()
FillImputer( features = Symbol[], continuous_fill = MLJModels._median, count_fill = MLJModels._round_median, finite_fill = MLJModels._mode)
# Load the RandomForestRegressor model type from the DecisionTree.jl backend.
Forest = @load RandomForestRegressor pkg=DecisionTree
RandomForestRegressor
# A 50-tree forest; fixed rng seed makes the scores reproducible.
model = Forest(n_trees=50, rng=0)
RandomForestRegressor( max_depth = -1, min_samples_leaf = 1, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 50, sampling_fraction = 0.7, feature_importance = :impurity, rng = 0)
# Chain imputation into the model: each fold imputes first, then fits the forest.
my_pipeline = preprocessor |> model
DeterministicPipeline( fill_imputer = FillImputer( features = Symbol[], continuous_fill = MLJModels._median, count_fill = MLJModels._round_median, finite_fill = MLJModels._mode), random_forest_regressor = RandomForestRegressor( max_depth = -1, min_samples_leaf = 1, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 50, sampling_fraction = 0.7, feature_importance = :impurity, rng = 0), cache = true)
# 5-fold cross-validation resampling strategy (no shuffling by default).
cv=CV(nfolds=5)
CV( nfolds = 5, shuffle = false, rng = Random._GLOBAL_RNG())
# Cross-validated mean absolute error of the whole pipeline on the training data.
scores = evaluate(my_pipeline, X, y, resampling=cv, measure=mae, verbosity=0)
PerformanceEvaluation object with these fields: measure, operation, measurement, per_fold, per_observation, fitted_params_per_fold, report_per_fold, train_test_rows Extract: ┌─────────────────────┬───────────┬─────────────┬─────────┬───────────────────────────── │ measure │ operation │ measurement │ 1.96*SE │ per_fold ⋯ ├─────────────────────┼───────────┼─────────────┼─────────┼───────────────────────────── │ MeanAbsoluteError() │ predict │ 18000.0 │ 1010.0 │ [17600.0, 18800.0, 18300.0 ⋯ └─────────────────────┴───────────┴─────────────┴─────────┴───────────────────────────── 1 column omitted
# `scores.measurement` holds one aggregate value per measure; we used a single one (MAE).
println("Average MAE score: $(first(scores.measurement))")
"""
    get_score(n_trees; nfolds=3)

Return the cross-validated mean absolute error (MAE) of an
impute-then-random-forest pipeline built with `n_trees` trees,
using `nfolds`-fold cross-validation.

NOTE(review): this closes over the notebook-level globals `X` and `y`
defined above — fine in a notebook, but keep in mind if extracting it.
"""
function get_score(n_trees; nfolds=3)
	preprocessor = FillImputer()
	# Fixed rng seed so scores are comparable across different n_trees values.
	model = RandomForestRegressor(n_trees=n_trees, rng=0)
	my_pipeline = preprocessor |> model
	scores = evaluate(my_pipeline, X, y, resampling=CV(nfolds=nfolds), measure=mae, verbosity=0)
	return first(scores.measurement)
end
get_score (generic function with 1 method)
# Collect (score => n_trees) pairs; a concrete eltype avoids a slow Vector{Any}.
results = Pair{Float64,Int}[]
Any[]
# Score forests of 50, 100, …, 450 trees, recording each score => tree count.
for n_trees in 50:50:450
	push!(results, get_score(n_trees) => n_trees)
end
results
9-element Vector{Any}: 18071.71025477222 => 50 17858.24558134262 => 100 17744.93062062645 => 150 17743.59114376393 => 200 17734.406606957855 => 250 17684.723639670297 => 300 17681.84223165018 => 350 17718.765966673294 => 400 17701.136647137413 => 450
# Just the MAE scores (first element of each pair).
first.(results)
9-element Vector{Float64}: 18071.71025477222 17858.24558134262 17744.93062062645 17743.59114376393 17734.406606957855 17684.723639670297 17681.84223165018 17718.765966673294 17701.136647137413
# Just the tree counts (second element of each pair).
last.(results)
9-element Vector{Int64}: 50 100 150 200 250 300 350 400 450
# MAE as a function of forest size.
plot(last.(results), first.(results))
# Pairs compare lexicographically, so this yields the pair with the lowest MAE.
minimum(results)
17681.84223165018 => 350
# Best tree count = second element of the lowest-MAE pair.
n_trees_best = last(minimum(results))
350
Built with Julia 1.9.1 and
CSV 0.10.9
DataFrames 1.5.0
MLJ 0.19.1
MLJDecisionTreeInterface 0.4.0
Plots 1.38.7
To run this tutorial locally, download [this file](/tutorials/crossvalidation02x05.jl) and open it with Pluto.jl.