using DataFrames, CSV, MLJ, MLJDecisionTreeInterface

# Read the training and test tables from disk.
X_full = CSV.read("data//home-data-for-ml-course//train.csv", DataFrame)
X_test_full = CSV.read("data//home-data-for-ml-course//test.csv", DataFrame)

# Target column, and the subset of columns used as predictors.
y = X_full.SalePrice
features = [
    :LotArea,
    :YearBuilt,
    Symbol("1stFlrSF"),  # not a valid identifier, so it cannot be written as :1stFlrSF
    Symbol("2ndFlrSF"),
    :FullBath,
    :BedroomAbvGr,
    :TotRmsAbvGrd,
]
X = copy(X_full[:, features])
X_test = copy(X_test_full[:, features])

# 80% / 20% train/validation split; `multi=true` partitions `X` and `y` together.
(X_train, X_valid), (y_train, y_valid) = partition((X, y), 0.8, multi=true)
((1168×7 DataFrame
  Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
      │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
──────┼──────────────────────────────────────────────────────────────────────────────
    1 │    8450       2003       856       854         2             3             8
    2 │    9600       1976      1262         0         2             3             6
    3 │   11250       2001       920       866         2             3             6
    4 │    9550       1915       961       756         1             3             7
    5 │   14260       2000      1145      1053         2             4             9
  ⋮   │    ⋮         ⋮         ⋮         ⋮         ⋮           ⋮             ⋮
 1165 │   16157       1978      1432         0         1             2             5
 1166 │    9541       2009      1502         0         2             3             7
 1167 │   10475       2008      1694         0         2             3             7
 1168 │   10852       2000       959       712         2             3             7
                                                                    1159 rows omitted, 292×7 DataFrame
 Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
     │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
─────┼──────────────────────────────────────────────────────────────────────────────
   1 │   13728       1935      1236       872         2             4             7
   2 │   35760       1995      1831      1796         3             4            10
   3 │    9880       1977      1118         0         1             3             6
   4 │    9120       1958      1261         0         1             3             6
   5 │    4017       2006       625       625         2             2             5
  ⋮  │    ⋮         ⋮         ⋮         ⋮         ⋮           ⋮             ⋮
 289 │   13175       1978      2073         0         2             3             7
 290 │    9042       1941      1188      1152         2             4             9
 291 │    9717       1950      1078         0         1             2             5
 292 │    9937       1965      1256         0         1             3             6
                                                                    283 rows omitted), ([208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000, 129900, 118000  …  235128, 185000, 146000, 224000, 129000, 108959, 194000, 233170, 245350, 173000], [235000, 625000, 171000, 163000, 171900, 200500, 239000, 285000, 119500, 115000  …  136000, 287090, 145000, 84500, 185000, 175000, 210000, 266500, 142125, 147500]))
# Bind the RandomForestRegressor model type from the DecisionTree package to `Forest`.
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
RandomForestRegressor
# Candidate forests to compare; every one is constructed with `rng=0`.
model_1 = Forest(n_trees=50, rng=0)
model_2 = Forest(n_trees=100, rng=0)
model_3 = Forest(n_trees=100, feature_importance=:split, rng=0)
model_4 = Forest(n_trees=200, min_samples_split=20, rng=0)
model_5 = Forest(n_trees=100, max_depth=7, rng=0)
models = [model_1, model_2, model_3, model_4, model_5]

"""
    score_model(model; X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid)

Fit `model` on `(X_t, y_t)` and return the mean absolute error of its predictions for
`X_v` compared against `y_v`.

# Keywords
- `X_t`, `y_t`: training features and target (default: the global training split).
- `X_v`, `y_v`: validation features and target (default: the global validation split).
"""
function score_model(model; X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid)
    # Level 0 switches off the scitype check MLJ performs when binding data to a model.
    mach = machine(model, X_t, y_t, scitype_check_level=0)
    fit!(mach, verbosity=0)
    preds = predict(mach, X_v)
    return mean_absolute_error(preds, y_v)
end
score_model (generic function with 1 method)
# Print the validation MAE of each candidate model in turn.
for (i, model) in enumerate(models)
    mae = score_model(model)
    println("Model $(i) MAE: $(mae)")
end

# Bind the selected model to the full feature table `X` and target `y`, then fit it.
best_model = model_3
my_model = machine(best_model, X, y, scitype_check_level=0)
fit!(my_model)
trained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args: 
    1:	Source @493 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @846 ⏎ AbstractVector{ScientificTypesBase.Count}
# Predict the target for every row of the test-set feature table.
preds_test = predict(my_model, X_test)
1459-element Vector{Float64}:
 123619.0
 154524.0
 186848.0
 180457.0
 184994.48
 200124.7
 173519.0
      ⋮
  88614.0
  86494.0
  86165.0
 153198.28
 135044.5
 232758.22
# Build the submission table: the test-set Id column next to the predicted SalePrice.
output = DataFrame("Id" => X_test_full.Id, "SalePrice" => preds_test)
| | Id | SalePrice |
|---|---|---|
| 1 | 1461 | 123619.0 | 
| 2 | 1462 | 154524.0 | 
| 3 | 1463 | 186848.0 | 
| 4 | 1464 | 180457.0 | 
| 5 | 1465 | 1.84994e5 | 
| 6 | 1466 | 2.00125e5 | 
| 7 | 1467 | 173519.0 | 
| 8 | 1468 | 1.74786e5 | 
| 9 | 1469 | 186665.0 | 
| 10 | 1470 | 1.18057e5 | 
| ... | ||
| 1459 | 2919 | 2.32758e5 | 
# Save the submission table as CSV; the call evaluates to the path that was written.
CSV.write("data//home-data-for-ml-course//submissions_02x01.csv", output)
"data//home-data-for-ml-course//submissions_02x01.csv"
Built with Julia 1.9.1 and
CSV 0.10.11
DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0
_To run this tutorial locally, download [this file](/tutorials/notebook02x01.jl) and open it with Pluto.jl._