using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
begin
    # Load the Ames housing training data. joinpath builds the path portably
    # (the original "data//…" double-slash separators worked, but are unidiomatic).
    iowa_file_path = joinpath("data", "home-data-for-ml-course", "train.csv")
    home_data = CSV.read(iowa_file_path, DataFrame)

    # Target variable: the sale price of each house.
    y = home_data.SalePrice

    # Predictor columns. Symbol("…") is needed for column names that start
    # with a digit, since those are not valid Julia identifiers.
    feature_names = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"),
                     :FullBath, :BedroomAbvGr, :TotRmsAbvGrd]
    X = home_data[:, feature_names]
end
LotAreaYearBuilt1stFlrSF2ndFlrSFFullBathBedroomAbvGrTotRmsAbvGrd
184502003856854238
29600197612620236
3112502001920866236
495501915961756137
514260200011451053249
6141151993796566115
710084200416940237
81038219731107983237
9612019311022752228
107420193910770125
...
14609937196512560136
# Load the RandomForestRegressor model type from the DecisionTree.jl package
# via MLJ's model registry; verbosity=0 silences the loading message.
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
RandomForestRegressor
# Instantiate the forest with default hyperparameters — per the printed model
# below: 100 trees, unlimited depth (max_depth = -1), sampling_fraction = 0.7.
forest = Forest()
RandomForestRegressor(
  max_depth = -1, 
  min_samples_leaf = 1, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = -1, 
  n_trees = 100, 
  sampling_fraction = 0.7, 
  feature_importance = :impurity, 
  rng = Random._GLOBAL_RNG())
# Bind the model to the data in an MLJ machine. scitype_check_level=0 disables
# MLJ's scientific-type check — the feature and target columns here are
# integer-valued (scitype Count, as shown in the machine's Source output),
# which would otherwise trigger a scitype mismatch warning for this regressor.
mach = machine(forest, X, y, scitype_check_level=0)
untrained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args: 
    1:	Source @358 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @253 ⏎ AbstractVector{ScientificTypesBase.Count}
# Train the random forest on the full training set (no hold-out split here).
fit!(mach)
trained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args: 
    1:	Source @358 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @253 ⏎ AbstractVector{ScientificTypesBase.Count}
# Path to the competition's unlabeled test set. joinpath is the portable,
# idiomatic replacement for the original hand-written "data//…" separators.
test_data_path = joinpath("data", "home-data-for-ml-course", "test.csv")
"data//home-data-for-ml-course//test.csv"
# Read the test set into a DataFrame (rows to generate predictions for).
test_data = CSV.read(test_data_path, DataFrame)
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShape...
1146120"RH""80"11622"Pave""NA""Reg"
2146220"RL""81"14267"Pave""NA""IR1"
3146360"RL""74"13830"Pave""NA""IR1"
4146460"RL""78"9978"Pave""NA""IR1"
51465120"RL""43"5005"Pave""NA""IR1"
6146660"RL""75"10000"Pave""NA""IR1"
7146720"RL""NA"7980"Pave""NA""IR1"
8146860"RL""63"8402"Pave""NA""IR1"
9146920"RL""85"10176"Pave""NA""Reg"
10147020"RL""70"8400"Pave""NA""Reg"
...
1459291960"RL""74"9627"Pave""NA""Reg"
# Select the same feature columns that were used for training, then
# predict sale prices for every test row with the fitted machine.
test_X = test_data[:,feature_names];
test_preds = predict(mach, test_X)
1459-element Vector{Float64}:
 124402.0
 156678.5
 189795.02
 181392.5
 184786.04
 199828.75
 174828.0
      ⋮
  87690.0
  86760.0
  86984.0
 156340.46
 133946.5
 238451.1
# Build the submission table. NOTE(review): the source column is test_data.Id
# and Kaggle's sample submission for this competition uses the header "Id" —
# the original :ID produced a column literally named "ID", which a
# case-sensitive scorer may reject; renamed to :Id to match.
output = DataFrame(:Id => test_data.Id, :SalePrice => test_preds)
IDSalePrice
11461124402.0
214621.56678e5
31463189795.0
414641.81392e5
51465184786.0
614661.99829e5
71467174828.0
814681.75705e5
914691.89534e5
1014701.13977e5
...
145929192.38451e5
# Write the submission file for upload (joinpath instead of the original
# hand-written "//" separators, for portability and idiom).
CSV.write(joinpath("data", "home-data-for-ml-course", "submissions.csv"), output)
"data//home-data-for-ml-course//submissions.csv"
# In-sample predictions: score the model on the same training features X,
# used below only for a quick error sanity check.
ŷ = predict(mach, X)
1460-element Vector{Float64}:
 207936.81
 175583.0
 216197.0
 144564.75
 255722.28
 149060.02
 276293.93
      ⋮
 184721.05
 176207.0
 224053.0
 233962.0
 133793.75
 153000.5
# Mean absolute error on the *training* data. NOTE(review): this is an
# optimistic estimate since the forest was fit on these same rows; a hold-out
# or cross-validated score would give an honest generalization estimate.
mean_absolute_error(ŷ, y)
11248.703209589035

Built with Julia 1.9.1 and

CSV 0.10.11
DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0

To run this tutorial locally, download [this file](/tutorials/randomforestcompetition01x04.jl) and open it with Pluto.jl.