using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
X_full = CSV.read("data/home-data-for-ml-course/train.csv", DataFrame);
X_test_full = CSV.read("data/home-data-for-ml-course/test.csv", DataFrame);
y = X_full.SalePrice;
# Column names that start with a digit need the Symbol constructor.
features = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"), :FullBath, :BedroomAbvGr, :TotRmsAbvGrd];
X = copy(X_full[:,features]);
X_test = copy(X_test_full[:,features]);
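Before splitting, it is worth a quick check that none of the selected columns contain missing values (these seven are complete in the Kaggle course data, but the check is cheap). A minimal sketch using DataFrames' `describe`:

```julia
# Count missing values per selected feature in the train and test tables.
describe(X, :nmissing)
describe(X_test, :nmissing)
```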
(X_train, X_valid), (y_train, y_valid) = partition((X, y), 0.8, multi=true)
((1168×7 DataFrame
  Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
      │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
──────┼──────────────────────────────────────────────────────────────────────────────
    1 │    8450       2003       856       854         2             3             8
    2 │    9600       1976      1262         0         2             3             6
    3 │   11250       2001       920       866         2             3             6
    4 │    9550       1915       961       756         1             3             7
    5 │   14260       2000      1145      1053         2             4             9
  ⋮   │    ⋮         ⋮          ⋮         ⋮         ⋮          ⋮             ⋮
 1165 │   16157       1978      1432         0         1             2             5
 1166 │    9541       2009      1502         0         2             3             7
 1167 │   10475       2008      1694         0         2             3             7
 1168 │   10852       2000       959       712         2             3             7
                                                               1159 rows omitted,
292×7 DataFrame
 Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
     │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
─────┼──────────────────────────────────────────────────────────────────────────────
   1 │   13728       1935      1236       872         2             4             7
   2 │   35760       1995      1831      1796         3             4            10
   3 │    9880       1977      1118         0         1             3             6
   4 │    9120       1958      1261         0         1             3             6
   5 │    4017       2006       625       625         2             2             5
  ⋮  │    ⋮         ⋮          ⋮         ⋮         ⋮          ⋮             ⋮
 289 │   13175       1978      2073         0         2             3             7
 290 │    9042       1941      1188      1152         2             4             9
 291 │    9717       1950      1078         0         1             2             5
 292 │    9937       1965      1256         0         1             3             6
                                                                283 rows omitted),
([208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000, 129900, 118000  …  235128, 185000, 146000, 224000, 129000, 108959, 194000, 233170, 245350, 173000],
 [235000, 625000, 171000, 163000, 171900, 200500, 239000, 285000, 119500, 115000  …  136000, 287090, 145000, 84500, 185000, 175000, 210000, 266500, 142125, 147500]))
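By default `partition` takes the first 80% of rows in order. If the rows carry any ordering, a shuffled and seeded split is safer; a sketch using `partition`'s `shuffle` and `rng` keywords (the seed 42 is arbitrary):

```julia
# Optional: a shuffled, reproducible 80/20 split.
(X_train, X_valid), (y_train, y_valid) =
    partition((X, y), 0.8, multi=true, shuffle=true, rng=42)
```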
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
RandomForestRegressor
model_1 = Forest(n_trees=50, rng=0);
model_2 = Forest(n_trees=100, rng=0);
model_3 = Forest(n_trees=100, feature_importance=:split, rng=0);
model_4 = Forest(n_trees=200, min_samples_split=20, rng=0);
model_5 = Forest(n_trees=100, max_depth=7, rng=0);
models = [model_1, model_2, model_3, model_4, model_5];
function score_model(model; X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid)
    # Bind the model to the training data; scitype_check_level=0 silences the
    # warning that the integer (Count) columns are not Continuous.
    mach = machine(model, X_t, y_t, scitype_check_level=0)
    fit!(mach, verbosity=0)
    # Score on the held-out validation set.
    preds = predict(mach, X_v)
    return mean_absolute_error(preds, y_v)
end
score_model (generic function with 1 method)
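A single holdout split gives a noisy estimate of model quality. As an alternative sketch, MLJ's `evaluate` can cross-validate a candidate model directly; the fold count here is illustrative:

```julia
# 5-fold cross-validated MAE for one candidate model.
evaluate(model_2, X, y,
         resampling=CV(nfolds=5, rng=0),
         measure=mean_absolute_error,
         verbosity=0)
```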
for i in eachindex(models)
    mae = score_model(models[i])
    println("Model $(i) MAE: $(mae)")
end
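The manual loop above can also be phrased with MLJ's built-in tuning machinery. A minimal sketch using `TunedModel` to grid-search `n_trees` over an illustrative value set, with the same 80/20 holdout:

```julia
# Grid-search n_trees with an 80/20 holdout, mirroring the manual comparison.
r = range(Forest(), :n_trees, values=[50, 100, 200])
tuned = TunedModel(model=Forest(rng=0),
                   tuning=Grid(),
                   range=r,
                   resampling=Holdout(fraction_train=0.8),
                   measure=mean_absolute_error)
tuned_mach = machine(tuned, X, y, scitype_check_level=0)
fit!(tuned_mach, verbosity=0)
fitted_params(tuned_mach).best_model
```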
best_model = model_3;
my_model = machine(best_model, X, y, scitype_check_level=0);
fit!(my_model)
trained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args:
    1:  Source @493 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:  Source @846 ⏎ AbstractVector{ScientificTypesBase.Count}
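Because `best_model` was configured with `feature_importance=:split`, the fitted machine can report which features drive its predictions. A sketch, assuming the installed MLJ version exposes the `feature_importances` accessor for machines:

```julia
# Split-based importances of the final model, as feature => importance pairs.
feature_importances(my_model)
```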
preds_test = predict(my_model, X_test)
1459-element Vector{Float64}:
 123619.0
 154524.0
 186848.0
 180457.0
 184994.48
 200124.7
 173519.0
      ⋮
  88614.0
  86494.0
  86165.0
 153198.28
 135044.5
 232758.22
output = DataFrame("Id" => X_test_full.Id,
"SalePrice" => preds_test)
|      | Id   | SalePrice |
|-----:|-----:|----------:|
| 1    | 1461 | 123619.0  |
| 2    | 1462 | 154524.0  |
| 3    | 1463 | 186848.0  |
| 4    | 1464 | 180457.0  |
| 5    | 1465 | 1.84994e5 |
| 6    | 1466 | 2.00125e5 |
| 7    | 1467 | 173519.0  |
| 8    | 1468 | 1.74786e5 |
| 9    | 1469 | 186665.0  |
| 10   | 1470 | 1.18057e5 |
| ⋮    | ⋮    | ⋮         |
| 1459 | 2919 | 2.32758e5 |
CSV.write("data/home-data-for-ml-course/submissions_02x01.csv", output)
"data/home-data-for-ml-course/submissions_02x01.csv"
Built with Julia 1.9.1 and
CSV 0.10.11
DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0
_To run this tutorial locally, download [this file](/tutorials/notebook02x01.jl) and open it with Pluto.jl._