using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
X_full = CSV.read("data/home-data-for-ml-course/train.csv", DataFrame);
X_test_full = CSV.read("data/home-data-for-ml-course/test.csv", DataFrame);
y = X_full.SalePrice;
features = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"), :FullBath, :BedroomAbvGr, :TotRmsAbvGrd];
X = copy(X_full[:,features]);
X_test = copy(X_test_full[:,features]);
(X_train, X_valid), (y_train, y_valid) = partition((X, y), 0.8, multi=true)
((1168×7 DataFrame
  Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
      │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
──────┼──────────────────────────────────────────────────────────────────────────────
    1 │    8450       2003       856       854         2             3             8
    2 │    9600       1976      1262         0         2             3             6
    3 │   11250       2001       920       866         2             3             6
    4 │    9550       1915       961       756         1             3             7
    5 │   14260       2000      1145      1053         2             4             9
  ⋮   │    ⋮         ⋮         ⋮         ⋮         ⋮           ⋮             ⋮
 1165 │   16157       1978      1432         0         1             2             5
 1166 │    9541       2009      1502         0         2             3             7
 1167 │   10475       2008      1694         0         2             3             7
 1168 │   10852       2000       959       712         2             3             7
                                                                    1159 rows omitted, 292×7 DataFrame
 Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
     │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
─────┼──────────────────────────────────────────────────────────────────────────────
   1 │   13728       1935      1236       872         2             4             7
   2 │   35760       1995      1831      1796         3             4            10
   3 │    9880       1977      1118         0         1             3             6
   4 │    9120       1958      1261         0         1             3             6
   5 │    4017       2006       625       625         2             2             5
  ⋮  │    ⋮         ⋮         ⋮         ⋮         ⋮           ⋮             ⋮
 289 │   13175       1978      2073         0         2             3             7
 290 │    9042       1941      1188      1152         2             4             9
 291 │    9717       1950      1078         0         1             2             5
 292 │    9937       1965      1256         0         1             3             6
                                                                    283 rows omitted), ([208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000, 129900, 118000  …  235128, 185000, 146000, 224000, 129000, 108959, 194000, 233170, 245350, 173000], [235000, 625000, 171000, 163000, 171900, 200500, 239000, 285000, 119500, 115000  …  136000, 287090, 145000, 84500, 185000, 175000, 210000, 266500, 142125, 147500]))
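By default `partition` splits the rows in their original order, which keeps the result deterministic. If a randomized split is preferred, `partition` also accepts `shuffle` and `rng` keywords; a minimal sketch, not used below (the seed 42 is an arbitrary choice):

# Alternative sketch: a reproducible shuffled 80/20 split
(Xs_train, Xs_valid), (ys_train, ys_valid) =
    partition((X, y), 0.8, multi=true, shuffle=true, rng=42)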
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
RandomForestRegressor
model_1 = Forest(n_trees=50, rng=0);                              # smaller forest
model_2 = Forest(n_trees=100, rng=0);                             # baseline forest
model_3 = Forest(n_trees=100, feature_importance=:split, rng=0);  # split-based feature importances
model_4 = Forest(n_trees=200, min_samples_split=20, rng=0);       # more trees; nodes split only with ≥ 20 samples
model_5 = Forest(n_trees=100, max_depth=7, rng=0);                # trees capped at depth 7
models = [model_1, model_2, model_3, model_4, model_5];
# Fit `model` on the training split and return its mean absolute error on the validation split.
function score_model(model; X_t = X_train, X_v = X_valid, y_t = y_train, y_v = y_valid)
    mach = machine(model, X_t, y_t, scitype_check_level=0)  # features are Count; skip the scitype check
    fit!(mach, verbosity=0)
    preds = predict(mach, X_v)
    return mean_absolute_error(preds, y_v)
end
score_model (generic function with 1 method)
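The `scitype_check_level=0` keyword above silences MLJ's scientific-type check: every feature column here has scitype `Count` (integers), while `RandomForestRegressor` declares `Continuous` input. An alternative, sketched below but not used in the rest of this notebook, is to coerce the columns once and drop the override:

# Alternative sketch: promote the integer columns to Continuous up front
X_cont = coerce(X, Count => Continuous);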
for (i, model) in enumerate(models)
    mae = score_model(model)
    println("Model $(i) MAE: $(mae)")
end
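A single holdout split gives a fairly noisy estimate of model quality. MLJ's `evaluate` can cross-validate a candidate on the full data instead; a minimal sketch using the coerced features from the sketch above (the fold count is an arbitrary choice):

# Sketch: 5-fold cross-validated MAE for one candidate forest
evaluate(model_3, X_cont, y,
         resampling=CV(nfolds=5, rng=0),
         measure=mae, verbosity=0)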
best_model = model_3;  # lowest validation MAE among the five candidates
my_model = machine(best_model, X, y, scitype_check_level=0);
fit!(my_model)
trained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args: 
    1:	Source @493 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @846 ⏎ AbstractVector{ScientificTypesBase.Count}
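Because `model_3` was constructed with `feature_importance=:split`, the fitted machine can also report which features the forest split on most. A minimal sketch using MLJ's `feature_importances` accessor:

# Sketch: feature => importance pairs from the trained forest
feature_importances(my_model)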
preds_test = predict(my_model, X_test)
1459-element Vector{Float64}:
 123619.0
 154524.0
 186848.0
 180457.0
 184994.48
 200124.7
 173519.0
      ⋮
  88614.0
  86494.0
  86165.0
 153198.28
 135044.5
 232758.22
output = DataFrame("Id" => X_test_full.Id,
                   "SalePrice" => preds_test)
1459×2 DataFrame
   Row │ Id     SalePrice
       │ Int64  Float64
───────┼──────────────────
     1 │  1461  123619.0
     2 │  1462  154524.0
     3 │  1463  186848.0
     4 │  1464  180457.0
     5 │  1465  1.84994e5
     6 │  1466  2.00125e5
     7 │  1467  173519.0
     8 │  1468  1.74786e5
     9 │  1469  186665.0
    10 │  1470  1.18057e5
   ⋮   │   ⋮        ⋮
  1459 │  2919  2.32758e5
                1448 rows omitted
CSV.write("data/home-data-for-ml-course/submissions_02x01.csv", output)
"data/home-data-for-ml-course/submissions_02x01.csv"

Built with Julia 1.9.1 and

CSV 0.10.11
DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0

_To run this tutorial locally, download [this file](/tutorials/notebook02x01.jl) and open it with Pluto.jl._