using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
X_full = CSV.read("data/home-data-for-ml-course/train.csv", DataFrame);
X_test_full = CSV.read("data/home-data-for-ml-course/test.csv", DataFrame);
y = X_full.SalePrice;
features = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"), :FullBath, :BedroomAbvGr, :TotRmsAbvGrd];
X = X_full[:, features];            # indexing with : already returns a copy
X_test = X_test_full[:, features];
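As a quick sanity check (not part of the original run), DataFrames' describe can confirm that the selected features contain no missing values before modelling:

describe(X, :nmissing)        # expect 0 missings for every feature
describe(X_test, :nmissing)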
(X_train, X_valid), (y_train, y_valid) = partition((X, y), 0.8, multi=true)
((1168×7 DataFrame
  Row  │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
       │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
───────┼─────────────────────────────────────────────────────────────────────────────
     1 │    8450       2003       856       854         2             3             8
     2 │    9600       1976      1262         0         2             3             6
     3 │   11250       2001       920       866         2             3             6
     4 │    9550       1915       961       756         1             3             7
     5 │   14260       2000      1145      1053         2             4             9
   ⋮   │     ⋮         ⋮          ⋮         ⋮         ⋮           ⋮             ⋮
  1165 │   16157       1978      1432         0         1             2             5
  1166 │    9541       2009      1502         0         2             3             7
  1167 │   10475       2008      1694         0         2             3             7
  1168 │   10852       2000       959       712         2             3             7
                                                                 1159 rows omitted,
292×7 DataFrame
  Row  │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
       │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
───────┼─────────────────────────────────────────────────────────────────────────────
     1 │   13728       1935      1236       872         2             4             7
     2 │   35760       1995      1831      1796         3             4            10
     3 │    9880       1977      1118         0         1             3             6
     4 │    9120       1958      1261         0         1             3             6
     5 │    4017       2006       625       625         2             2             5
   ⋮   │     ⋮         ⋮          ⋮         ⋮         ⋮           ⋮             ⋮
   289 │   13175       1978      2073         0         2             3             7
   290 │    9042       1941      1188      1152         2             4             9
   291 │    9717       1950      1078         0         1             2             5
   292 │    9937       1965      1256         0         1             3             6
                                                                  283 rows omitted),
([208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000, 129900, 118000 … 235128, 185000, 146000, 224000, 129000, 108959, 194000, 233170, 245350, 173000],
 [235000, 625000, 171000, 163000, 171900, 200500, 239000, 285000, 119500, 115000 … 136000, 287090, 145000, 84500, 185000, 175000, 210000, 266500, 142125, 147500]))
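Note that without shuffle=true, partition takes the first 80% of rows in file order. A randomized split is a one-keyword change; a sketch with an assumed seed of 42, not what was run above:

# hypothetical alternative: shuffle rows before splitting, reproducibly
(X_train, X_valid), (y_train, y_valid) = partition((X, y), 0.8, multi=true, shuffle=true, rng=42)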
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
RandomForestRegressor
model_1 = Forest(n_trees=50, rng=0);                              # smaller forest
model_2 = Forest(n_trees=100, rng=0);                             # more trees
model_3 = Forest(n_trees=100, feature_importance=:split, rng=0);  # also report split-based importances
model_4 = Forest(n_trees=200, min_samples_split=20, rng=0);       # larger forest, coarser splits
model_5 = Forest(n_trees=100, max_depth=7, rng=0);                # depth-limited trees
models = [model_1, model_2, model_3, model_4, model_5];
function score_model(model; X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid)
    mach = machine(model, X_t, y_t, scitype_check_level=0)  # bind model to training data
    fit!(mach, verbosity=0)
    preds = predict(mach, X_v)
    return mean_absolute_error(preds, y_v)
end
score_model (generic function with 1 method)
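A single train/validation split can be noisy. As a sketch (not run in this notebook), MLJ's evaluate would estimate the same MAE by 5-fold cross-validation:

# hypothetical: 5-fold CV estimate of MAE for one candidate model
evaluate(model_3, X, y, resampling=CV(nfolds=5), measure=mean_absolute_error, verbosity=0)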
for (i, model) in enumerate(models)
    mae = score_model(model)
    println("Model $(i) MAE: $(mae)")
end
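The winner can also be picked programmatically instead of by eye; a small sketch using the scoring function above:

scores = [score_model(m) for m in models]   # validation MAE per candidate
best_idx = argmin(scores)                   # index of the smallest MAE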
best_model = model_3;   # the model with the lowest validation MAE above
my_model = machine(best_model, X, y, scitype_check_level=0);
fit!(my_model)
trained Machine; caches model-specific representations of data
model: RandomForestRegressor(max_depth = -1, …)
args:
1: Source @493 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
2: Source @846 ⏎ AbstractVector{ScientificTypesBase.Count}
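Because model_3 was built with feature_importance=:split, the fitted machine should be able to report which features the forest relies on; a sketch, assuming MLJ's feature_importances is implemented for this model:

feature_importances(my_model)   # Vector of feature => importance pairs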
preds_test = predict(my_model, X_test)
1459-element Vector{Float64}:
123619.0
154524.0
186848.0
180457.0
184994.48
200124.7
173519.0
⋮
88614.0
86494.0
86165.0
153198.28
135044.5
232758.22
output = DataFrame("Id" => X_test_full.Id,
"SalePrice" => preds_test)
| Row | Id | SalePrice |
|---|---|---|
| 1 | 1461 | 123619.0 |
| 2 | 1462 | 154524.0 |
| 3 | 1463 | 186848.0 |
| 4 | 1464 | 180457.0 |
| 5 | 1465 | 1.84994e5 |
| 6 | 1466 | 2.00125e5 |
| 7 | 1467 | 173519.0 |
| 8 | 1468 | 1.74786e5 |
| 9 | 1469 | 186665.0 |
| 10 | 1470 | 1.18057e5 |
| ⋮ | ⋮ | ⋮ |
| 1459 | 2919 | 2.32758e5 |
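Before writing the file, a cheap guard (an added check, not in the original run) that the submission has one prediction per test house:

@assert nrow(output) == nrow(X_test_full)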
CSV.write("data/home-data-for-ml-course/submissions_02x01.csv", output)
"data/home-data-for-ml-course/submissions_02x01.csv"
Built with Julia 1.9.1 and
CSV 0.10.11
DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0

_To run this tutorial locally, download [this file](/tutorials/notebook02x01.jl) and open it with Pluto.jl._