using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
begin
    iowa_file_path = "data//home-data-for-ml-course//train.csv"
    home_data = CSV.read(iowa_file_path, DataFrame);
    y = home_data.SalePrice;
    feature_names = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"), :FullBath, :BedroomAbvGr, :TotRmsAbvGrd];
    X = home_data[:, feature_names];
    
    # Specify the model: DecisionTree.jl's regressor, via the MLJ interface
    Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
    iowa_model = Tree()
    # scitype_check_level=0 silences warnings about the integer-valued (Count)
    # columns and target, which MLJ would otherwise flag against the model's
    # declared scitypes
    mach = machine(iowa_model, X, y, scitype_check_level=0)
    
    # Fit Model
    fit!(mach)
    predictions = predict(mach, X)
    
    println("First in-sample predictions:", predictions[1:5])
    println("Actual target values for those homes:", y[1:5])
    mean_absolute_error(predictions, y)
end
16686.449680908874
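Note that these predictions are made on the very data the tree was trained on, so this MAE is optimistic; the rest of the tutorial estimates error on held-out data instead. As a quick alternative to the manual split below, MLJ can run the holdout evaluation in one call. A minimal sketch, assuming only the packages already loaded (not part of the original notebook):

evaluate!(machine(iowa_model, X, y, scitype_check_level=0),
          resampling=Holdout(fraction_train=0.8, rng=123),
          measure=mae, verbosity=0)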
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true);
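With `multi=true`, `partition` applies the same shuffled 80/20 row split to both objects, keeping features and target aligned. A quick sanity check (illustrative, not in the original):

size(Xtrain, 1), size(Xtest, 1)  # (1168, 292) for the 1460-row dataset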
function get_mae(min_samples_leaf, Xtrain, Xtest, ytrain, ytest)
    # Fit a tree with the given min_samples_leaf on the training split
    # and return its MAE on the held-out split
    model = DecisionTreeRegressor(min_samples_leaf = min_samples_leaf, rng = 0)
    mach1 = machine(model, Xtrain, ytrain, scitype_check_level=0)
    fit!(mach1)
    preds_val = predict(mach1, Xtest)
    return mean_absolute_error(preds_val, ytest)
end
get_mae (generic function with 1 method)
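For example, `get_mae(5, Xtrain, Xtest, ytrain, ytest)` should reproduce the first entry of the `scores` vector computed below (≈ 26826.47).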
# The original Kaggle exercise tunes max_leaf_nodes over [5, 25, 50, 100, 250, 500],
# but DecisionTree.jl exposes no such parameter (see the model printout below),
# so we tune min_samples_leaf instead
candidate_min_samples_leaf = [5, 6, 7, 8, 9, 10]
6-element Vector{Int64}:
  5
  6
  7
  8
  9
 10
scores = [get_mae(leaf_size, Xtrain, Xtest, ytrain, ytest) => leaf_size for leaf_size in candidate_min_samples_leaf]
6-element Vector{Pair{Float64, Int64}}:
 26826.471499238956 => 5
 27065.523599127268 => 6
 27542.850943833557 => 7
 28093.636413500884 => 8
  27197.37198413102 => 9
 27653.827009645032 => 10
best_min_samples_leaf = minimum(scores).second
5
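`minimum` works here because Julia orders `Pair`s lexicographically, first element first; since each pair stores the MAE first, this selects the pair with the lowest validation MAE, and `.second` extracts the corresponding hyperparameter value. An equivalent, more explicit formulation (illustrative):

best_mae, best_idx = findmin(first.(scores))
candidate_min_samples_leaf[best_idx]  # 5, the same result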
final_model = Tree(min_samples_leaf = best_min_samples_leaf, rng = 123)
DecisionTreeRegressor(
  max_depth = -1, 
  min_samples_leaf = 5, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = 0, 
  post_prune = false, 
  merge_purity_threshold = 1.0, 
  feature_importance = :impurity, 
  rng = 123)
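In this printout, `max_depth = -1` means the tree may grow to unlimited depth, and `n_subfeatures = 0` means all features are considered at every split (DecisionTree.jl's conventions).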
final_mach = machine(final_model, X, y, scitype_check_level=0)
untrained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @086 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @318 ⏎ AbstractVector{ScientificTypesBase.Count}
fit!(final_mach)
trained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @086 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @318 ⏎ AbstractVector{ScientificTypesBase.Count}
ŷ = predict(final_mach, X)
1460-element Vector{Float64}:
 212000.0
 163600.0
 207066.66666666666
 127600.0
 264816.6666666667
 140375.0
 264580.0
      ⋮
 182591.42857142858
 175700.0
 225066.66666666666
 256480.0
 133045.0
 154485.7142857143
mean_absolute_error(ŷ, y)
16686.449680908874
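This matches the in-sample MAE from the first cell, which is expected: `min_samples_leaf = 5` appears to be the regressor's default, so the tuned model coincides with the original one. Finally, the whole search-and-refit workflow can also be expressed with MLJ's tuning machinery; a sketch under the assumption that the packages loaded above suffice (names are illustrative, not part of the original notebook):

r = range(Tree(), :min_samples_leaf, values=candidate_min_samples_leaf)
self_tuning_tree = TunedModel(model=Tree(), tuning=Grid(),
                              resampling=Holdout(fraction_train=0.8, rng=123),
                              range=r, measure=mae)
tuned_mach = machine(self_tuning_tree, X, y, scitype_check_level=0)
fit!(tuned_mach, verbosity=0)
fitted_params(tuned_mach).best_model  # the winning model, refit on all of X, y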

Built with Julia 1.9.1 and

CSV 0.10.9
DataFrames 1.5.0
MLJ 0.19.1
MLJDecisionTreeInterface 0.4.0

To run this tutorial locally, download [this file](/tutorials/modelvalidation01x02.jl) and open it with Pluto.jl.