using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
begin
# Kaggle "Home Data for ML Course" training set (Iowa housing data).
iowa_file_path = joinpath("data", "home-data-for-ml-course", "train.csv")
home_data = CSV.read(iowa_file_path, DataFrame);
# Prediction target: the sale price of each house.
y = home_data.SalePrice;
# Predictor columns. Symbol("...") is required for column names that start
# with a digit and therefore cannot be written as literal symbols.
feature_names = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"), :FullBath, :BedroomAbvGr, :TotRmsAbvGrd];
X = home_data[:, feature_names];
# Split into validation and training data (80% train / 20% validation);
# fixed rng seed keeps the split reproducible.
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true);
# Specify Model: a single decision-tree regressor with default hyperparameters.
# scitype_check_level=0 silences MLJ's scitype warnings for integer (Count) columns.
Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
iowa_model = Tree()
mach = machine(iowa_model, Xtrain, ytrain, scitype_check_level=0)
# Fit Model
fit!(mach, verbosity = 0)
val_predictions = predict(mach, Xtest)
val_mae = mean_absolute_error(val_predictions, ytest)
println("Validation MAE when not specifying min_samples_leaf: $(round(Int, val_mae))")
# Regularized tree: min_samples_leaf=5 is DecisionTree.jl's analogue of the
# max_leaf_nodes tuning in the original (scikit-learn) course exercise.
iowa_model = Tree(min_samples_leaf=5, rng=1)
mach = machine(iowa_model, Xtrain, ytrain, scitype_check_level=0)
fit!(mach, verbosity = 0)
val_predictions = predict(mach, Xtest)
val_mae = mean_absolute_error(val_predictions, ytest)
println("Validation MAE for best value of min_samples_leaf: $(round(Int, val_mae))")
end
# Random forest: an ensemble of decision trees, typically yielding a lower
# validation error than any single hand-tuned tree.
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
forest = Forest()
# Default hyperparameters (as reported by the constructor):
#   max_depth = -1, min_samples_leaf = 1, min_samples_split = 2,
#   min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 100,
#   sampling_fraction = 0.7, feature_importance = :impurity
# scitype_check_level=0 suppresses the scitype warning for integer (Count) columns.
rf_model = machine(forest, Xtrain, ytrain, scitype_check_level=0)
fit!(rf_model)
rf_val_predictions = predict(rf_model, Xtest)
rf_val_mae = mean_absolute_error(rf_val_predictions, ytest)
# Observed validation MAE for the random forest: ≈ 22869.54 — a clear
# improvement over the single decision tree above.
# Built with Julia 1.9.1 and:
# CSV 0.10.9, DataFrames 1.5.0, MLJ 0.19.1, MLJDecisionTreeInterface 0.4.0
# To run this tutorial locally, download [this file](/tutorials/randomforests01x03.jl) and open it with Pluto.jl.