using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
begin
    # Load the Kaggle "Home Data for ML Course" training set and split it
    # into the target vector `y` (sale price) and the feature table `X`.
    iowa_file_path = "data//home-data-for-ml-course//train.csv"
    home_data = CSV.read(iowa_file_path, DataFrame)
    y = home_data.SalePrice
    # Two column names start with a digit and cannot be written as literal
    # symbols (`:1stFlrSF` is invalid syntax), so build all the symbols
    # from strings with a broadcast instead.
    feature_names = Symbol.([
        "LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF",
        "FullBath", "BedroomAbvGr", "TotRmsAbvGrd",
    ])
    X = home_data[:, feature_names]
end
LotArea | YearBuilt | 1stFlrSF | 2ndFlrSF | FullBath | BedroomAbvGr | TotRmsAbvGrd | |
---|---|---|---|---|---|---|---|
1 | 8450 | 2003 | 856 | 854 | 2 | 3 | 8 |
2 | 9600 | 1976 | 1262 | 0 | 2 | 3 | 6 |
3 | 11250 | 2001 | 920 | 866 | 2 | 3 | 6 |
4 | 9550 | 1915 | 961 | 756 | 1 | 3 | 7 |
5 | 14260 | 2000 | 1145 | 1053 | 2 | 4 | 9 |
6 | 14115 | 1993 | 796 | 566 | 1 | 1 | 5 |
7 | 10084 | 2004 | 1694 | 0 | 2 | 3 | 7 |
8 | 10382 | 1973 | 1107 | 983 | 2 | 3 | 7 |
9 | 6120 | 1931 | 1022 | 752 | 2 | 2 | 8 |
10 | 7420 | 1939 | 1077 | 0 | 1 | 2 | 5 |
... | |||||||
1460 | 9937 | 1965 | 1256 | 0 | 1 | 3 | 6 |
# Load the RandomForestRegressor model type from DecisionTree.jl through
# MLJ's model registry; `verbosity=0` suppresses the loading message.
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
RandomForestRegressor
# Instantiate the model with its default hyperparameters
# (100 trees, unlimited depth, 0.7 sampling fraction — shown in the cell output).
forest = Forest()
RandomForestRegressor( max_depth = -1, min_samples_leaf = 1, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = -1, n_trees = 100, sampling_fraction = 0.7, feature_importance = :impurity, rng = Random._GLOBAL_RNG())
# Bind the model to the data. `scitype_check_level=0` silences MLJ's scitype
# warning: the features and target are integer-valued (Count scitype, see the
# machine's output below), while the regressor nominally expects Continuous.
# NOTE(review): `coerce(X, Count => Continuous)` would be the stricter fix — confirm.
mach = machine(forest, X, y, scitype_check_level=0)
untrained Machine; caches model-specific representations of data model: RandomForestRegressor(max_depth = -1, …) args: 1: Source @358 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}} 2: Source @253 ⏎ AbstractVector{ScientificTypesBase.Count}
# Train the random forest on the full training set.
fit!(mach)
trained Machine; caches model-specific representations of data model: RandomForestRegressor(max_depth = -1, …) args: 1: Source @358 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}} 2: Source @253 ⏎ AbstractVector{ScientificTypesBase.Count}
# Path to the competition's held-out test set.
test_data_path = "data//home-data-for-ml-course//test.csv"
"data//home-data-for-ml-course//test.csv"
# Read the test set into a DataFrame.
test_data = CSV.read(test_data_path, DataFrame)
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | ... | |
---|---|---|---|---|---|---|---|---|---|
1 | 1461 | 20 | "RH" | "80" | 11622 | "Pave" | "NA" | "Reg" | |
2 | 1462 | 20 | "RL" | "81" | 14267 | "Pave" | "NA" | "IR1" | |
3 | 1463 | 60 | "RL" | "74" | 13830 | "Pave" | "NA" | "IR1" | |
4 | 1464 | 60 | "RL" | "78" | 9978 | "Pave" | "NA" | "IR1" | |
5 | 1465 | 120 | "RL" | "43" | 5005 | "Pave" | "NA" | "IR1" | |
6 | 1466 | 60 | "RL" | "75" | 10000 | "Pave" | "NA" | "IR1" | |
7 | 1467 | 20 | "RL" | "NA" | 7980 | "Pave" | "NA" | "IR1" | |
8 | 1468 | 60 | "RL" | "63" | 8402 | "Pave" | "NA" | "IR1" | |
9 | 1469 | 20 | "RL" | "85" | 10176 | "Pave" | "NA" | "Reg" | |
10 | 1470 | 20 | "RL" | "70" | 8400 | "Pave" | "NA" | "Reg" | |
... | |||||||||
1459 | 2919 | 60 | "RL" | "74" | 9627 | "Pave" | "NA" | "Reg" |
# Select the same feature columns used for training; the trailing semicolon
# suppresses the cell's table output.
test_X = test_data[:,feature_names];
# Predict sale prices for the test set with the fitted machine.
test_preds = predict(mach, test_X)
1459-element Vector{Float64}: 124402.0 156678.5 189795.02 181392.5 184786.04 199828.75 174828.0 ⋮ 87690.0 86760.0 86984.0 156340.46 133946.5 238451.1
# Assemble the submission table. The id column is named :Id — matching the
# source column `test_data.Id` and the competition's expected submission
# header "Id,SalePrice" — rather than the mismatched :ID used previously.
output = DataFrame(:Id => test_data.Id, :SalePrice => test_preds)
ID | SalePrice | |
---|---|---|
1 | 1461 | 124402.0 |
2 | 1462 | 1.56678e5 |
3 | 1463 | 189795.0 |
4 | 1464 | 1.81392e5 |
5 | 1465 | 184786.0 |
6 | 1466 | 1.99829e5 |
7 | 1467 | 174828.0 |
8 | 1468 | 1.75705e5 |
9 | 1469 | 1.89534e5 |
10 | 1470 | 1.13977e5 |
... | ||
1459 | 2919 | 2.38451e5 |
# Write the submission file for upload to Kaggle.
CSV.write("data//home-data-for-ml-course//submissions.csv", output)
"data//home-data-for-ml-course//submissions.csv"
# Predictions on the *training* data — used below for an in-sample error
# estimate, which is optimistic compared to a held-out validation score.
ŷ = predict(mach, X)
1460-element Vector{Float64}: 207936.81 175583.0 216197.0 144564.75 255722.28 149060.02 276293.93 ⋮ 184721.05 176207.0 224053.0 233962.0 133793.75 153000.5
# In-sample mean absolute error (computed on the training data), not a
# validation score.
mean_absolute_error(ŷ, y)
11248.703209589035
Built with Julia 1.9.1 and
CSV 0.10.11, DataFrames 1.5.0,
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0
_To run this tutorial locally, download [this file](/tutorials/randomforestcompetition01x04.jl) and open it with Pluto.jl._