```julia
using DataFrames, CSV, MLJ, MLJDecisionTreeInterface

begin
    iowa_file_path = "data//home-data-for-ml-course//train.csv"
    home_data = CSV.read(iowa_file_path, DataFrame)
    y = home_data.SalePrice
    feature_names = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"),
                     :FullBath, :BedroomAbvGr, :TotRmsAbvGrd]
    X = home_data[:, feature_names]
end
```

| | LotArea | YearBuilt | 1stFlrSF | 2ndFlrSF | FullBath | BedroomAbvGr | TotRmsAbvGrd |
|---|---|---|---|---|---|---|---|
| 1 | 8450 | 2003 | 856 | 854 | 2 | 3 | 8 |
| 2 | 9600 | 1976 | 1262 | 0 | 2 | 3 | 6 |
| 3 | 11250 | 2001 | 920 | 866 | 2 | 3 | 6 |
| 4 | 9550 | 1915 | 961 | 756 | 1 | 3 | 7 |
| 5 | 14260 | 2000 | 1145 | 1053 | 2 | 4 | 9 |
| 6 | 14115 | 1993 | 796 | 566 | 1 | 1 | 5 |
| 7 | 10084 | 2004 | 1694 | 0 | 2 | 3 | 7 |
| 8 | 10382 | 1973 | 1107 | 983 | 2 | 3 | 7 |
| 9 | 6120 | 1931 | 1022 | 752 | 2 | 2 | 8 |
| 10 | 7420 | 1939 | 1077 | 0 | 1 | 2 | 5 |
| ... | |||||||
| 1460 | 9937 | 1965 | 1256 | 0 | 1 | 3 | 6 |
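
All seven selected columns hold integer values, so MLJ assigns them (and the integer-valued `SalePrice` target) the `Count` scientific type rather than `Continuous`, which MLJ would otherwise warn about; that is why the machine further down is constructed with `scitype_check_level=0`. As a minimal, optional sketch (not part of the original notebook), the scitypes can be inspected and, if preferred, coerced instead of silencing the check:

```julia
# Inspect the scientific types MLJ assigns to the selected columns.
schema(X)

# Hypothetical alternative to scitype_check_level=0: coerce the integer
# features and target to Continuous before binding them to a machine.
Xc = coerce(X, Count => Continuous)
yc = coerce(y, Continuous)
```
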
```julia
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
```

```
RandomForestRegressor
```
```julia
forest = Forest()
```

```
RandomForestRegressor(
  max_depth = -1,
  min_samples_leaf = 1,
  min_samples_split = 2,
  min_purity_increase = 0.0,
  n_subfeatures = -1,
  n_trees = 100,
  sampling_fraction = 0.7,
  feature_importance = :impurity,
  rng = Random._GLOBAL_RNG())
```
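
The printout above lists the forest's default hyperparameters; any of them can be overridden as keyword arguments to the constructor returned by `@load`. The values below are purely illustrative, not tuned for this dataset:

```julia
# Hypothetical variant: a larger forest of depth-limited trees.
forest_custom = Forest(n_trees = 200, max_depth = 12, min_samples_leaf = 3)
```
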
```julia
mach = machine(forest, X, y, scitype_check_level=0)
```

```
untrained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args:
    1: Source @358 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2: Source @253 ⏎ AbstractVector{ScientificTypesBase.Count}
```
```julia
fit!(mach)
```

```
trained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args:
    1: Source @358 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2: Source @253 ⏎ AbstractVector{ScientificTypesBase.Count}
```
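
With the machine trained, MLJ's generic accessors can look inside it. `fitted_params` and `report` are standard MLJ functions; whether per-feature importances are exposed depends on the model interface, so the last call below is best treated as an optional extra:

```julia
# The learned DecisionTree.jl ensemble and the training report.
fitted_params(mach)
report(mach)

# Impurity-based feature importances (the default feature_importance = :impurity
# in the printout above suggests these are computed during training).
feature_importances(mach)
```
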
```julia
test_data_path = "data//home-data-for-ml-course//test.csv"
```

```
"data//home-data-for-ml-course//test.csv"
```

```julia
test_data = CSV.read(test_data_path, DataFrame)
```

| | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | ... |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 1461 | 20 | "RH" | "80" | 11622 | "Pave" | "NA" | "Reg" | |
| 2 | 1462 | 20 | "RL" | "81" | 14267 | "Pave" | "NA" | "IR1" | |
| 3 | 1463 | 60 | "RL" | "74" | 13830 | "Pave" | "NA" | "IR1" | |
| 4 | 1464 | 60 | "RL" | "78" | 9978 | "Pave" | "NA" | "IR1" | |
| 5 | 1465 | 120 | "RL" | "43" | 5005 | "Pave" | "NA" | "IR1" | |
| 6 | 1466 | 60 | "RL" | "75" | 10000 | "Pave" | "NA" | "IR1" | |
| 7 | 1467 | 20 | "RL" | "NA" | 7980 | "Pave" | "NA" | "IR1" | |
| 8 | 1468 | 60 | "RL" | "63" | 8402 | "Pave" | "NA" | "IR1" | |
| 9 | 1469 | 20 | "RL" | "85" | 10176 | "Pave" | "NA" | "Reg" | |
| 10 | 1470 | 20 | "RL" | "70" | 8400 | "Pave" | "NA" | "Reg" | |
| ... | |||||||||
| 1459 | 2919 | 60 | "RL" | "74" | 9627 | "Pave" | "NA" | "Reg" |
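
Many columns in the raw test file contain "NA" entries (parsed here as literal strings, since no `missingstring` was specified) or genuinely empty cells, but the seven feature columns selected earlier are fully populated, which is why the prediction below runs without any imputation. A quick sanity check, as a sketch:

```julia
# Count missing values per selected feature column in the test set;
# every count should be zero for predict to run without imputation.
Dict(name => count(ismissing, test_data[!, name]) for name in feature_names)
```
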
```julia
test_X = test_data[:, feature_names];
test_preds = predict(mach, test_X)
```

```
1459-element Vector{Float64}:
124402.0
156678.5
189795.02
181392.5
184786.04
199828.75
174828.0
⋮
87690.0
86760.0
86984.0
156340.46
133946.5
238451.1
```
```julia
output = DataFrame(:ID => test_data.Id, :SalePrice => test_preds)
```

| | ID | SalePrice |
|---|---|---|
| 1 | 1461 | 124402.0 |
| 2 | 1462 | 1.56678e5 |
| 3 | 1463 | 189795.0 |
| 4 | 1464 | 1.81392e5 |
| 5 | 1465 | 184786.0 |
| 6 | 1466 | 1.99829e5 |
| 7 | 1467 | 174828.0 |
| 8 | 1468 | 1.75705e5 |
| 9 | 1469 | 1.89534e5 |
| 10 | 1470 | 1.13977e5 |
| ... | ||
| 1459 | 2919 | 2.38451e5 |

```julia
CSV.write("data//home-data-for-ml-course//submissions.csv", output)
```

```
"data//home-data-for-ml-course//submissions.csv"
```

```julia
ŷ = predict(mach, X)
```

```
1460-element Vector{Float64}:
207936.81
175583.0
216197.0
144564.75
255722.28
149060.02
276293.93
⋮
184721.05
176207.0
224053.0
233962.0
133793.75
153000.5
```
```julia
mean_absolute_error(ŷ, y)
```

```
11248.703209589035
```
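
The 11248.7 above is an in-sample error: the forest is scored on the very rows it was trained on, so it understates the error to expect on unseen houses. A fairer estimate comes from a hold-out split; the sketch below is illustrative (the 80/20 split, the seed, and the name `mach_holdout` are not from the original notebook):

```julia
# Hold out 20% of the rows, train on the remaining 80%, and score on the held-out part.
train_rows, test_rows = partition(eachindex(y), 0.8; shuffle = true, rng = 42)
mach_holdout = machine(forest, X, y, scitype_check_level = 0)
fit!(mach_holdout, rows = train_rows)
mean_absolute_error(predict(mach_holdout, rows = test_rows), y[test_rows])
```

Cross-validation via MLJ's `evaluate!` with a `CV` resampling strategy would give a similar estimate averaged over several such splits.
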
Built with Julia 1.9.1 and CSV 0.10.11, DataFrames 1.5.0, MLJ 0.19.2, MLJDecisionTreeInterface 0.4.0.

_To run this tutorial locally, download [this file](/tutorials/randomforestcompetition01x04.jl) and open it with Pluto.jl._