- plutoTemplate

using DataFrames, CSV, MLJ, MLJDecisionTreeInterface

begin
    iowa_file_path = "data//home-data-for-ml-course//train.csv"
    home_data = CSV.read(iowa_file_path, DataFrame)
    y = home_data.SalePrice
    feature_names = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"), :FullBath, :BedroomAbvGr, :TotRmsAbvGrd]
    X = home_data[:, feature_names]
end

	LotArea	YearBuilt	1stFlrSF	2ndFlrSF	FullBath	BedroomAbvGr	TotRmsAbvGrd
1	8450	2003	856	854	2	3	8
2	9600	1976	1262	0	2	3	6
3	11250	2001	920	866	2	3	6
4	9550	1915	961	756	1	3	7
5	14260	2000	1145	1053	2	4	9
6	14115	1993	796	566	1	1	5
7	10084	2004	1694	0	2	3	7
8	10382	1973	1107	983	2	3	7
9	6120	1931	1022	752	2	2	8
10	7420	1939	1077	0	1	2	5
...
1460	9937	1965	1256	0	1	3	6

Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0

RandomForestRegressor

forest = Forest()

RandomForestRegressor(
  max_depth = -1, 
  min_samples_leaf = 1, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = -1, 
  n_trees = 100, 
  sampling_fraction = 0.7, 
  feature_importance = :impurity, 
  rng = Random._GLOBAL_RNG())

mach = machine(forest, X, y, scitype_check_level=0)

untrained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args: 
    1:	Source @358 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @253 ⏎ AbstractVector{ScientificTypesBase.Count}

fit!(mach)

trained Machine; caches model-specific representations of data
  model: RandomForestRegressor(max_depth = -1, …)
  args: 
    1:	Source @358 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @253 ⏎ AbstractVector{ScientificTypesBase.Count}

test_data_path = "data//home-data-for-ml-course//test.csv"

"data//home-data-for-ml-course//test.csv"

test_data = CSV.read(test_data_path, DataFrame)

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape
1	1461	20	"RH"	"80"	11622	"Pave"	"NA"	"Reg"
2	1462	20	"RL"	"81"	14267	"Pave"	"NA"	"IR1"
3	1463	60	"RL"	"74"	13830	"Pave"	"NA"	"IR1"
4	1464	60	"RL"	"78"	9978	"Pave"	"NA"	"IR1"
5	1465	120	"RL"	"43"	5005	"Pave"	"NA"	"IR1"
6	1466	60	"RL"	"75"	10000	"Pave"	"NA"	"IR1"
7	1467	20	"RL"	"NA"	7980	"Pave"	"NA"	"IR1"
8	1468	60	"RL"	"63"	8402	"Pave"	"NA"	"IR1"
9	1469	20	"RL"	"85"	10176	"Pave"	"NA"	"Reg"
10	1470	20	"RL"	"70"	8400	"Pave"	"NA"	"Reg"
...
1459	2919	60	"RL"	"74"	9627	"Pave"	"NA"	"Reg"

test_X = test_data[:,feature_names];

test_preds = predict(mach, test_X)

1459-element Vector{Float64}:
 124402.0
 156678.5
 189795.02
 181392.5
 184786.04
 199828.75
 174828.0
      ⋮
  87690.0
  86760.0
  86984.0
 156340.46
 133946.5
 238451.1

output = DataFrame(:ID => test_data.Id,:SalePrice => test_preds)

	ID	SalePrice
1	1461	124402.0
2	1462	1.56678e5
3	1463	189795.0
4	1464	1.81392e5
5	1465	184786.0
6	1466	1.99829e5
7	1467	174828.0
8	1468	1.75705e5
9	1469	1.89534e5
10	1470	1.13977e5
...
1459	2919	2.38451e5

CSV.write("data//home-data-for-ml-course//submissions.csv", output)

"data//home-data-for-ml-course//submissions.csv"

ŷ = predict(mach, X)

1460-element Vector{Float64}:
 207936.81
 175583.0
 216197.0
 144564.75
 255722.28
 149060.02
 276293.93
      ⋮
 184721.05
 176207.0
 224053.0
 233962.0
 133793.75
 153000.5

mean_absolute_error(ŷ, y)

11248.703209589035

Built with Julia 1.9.1 and

CSV 0.10.11
DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0

To run this tutorial locally, download [this file](/tutorials/randomforestcompetition01x04.jl) and open it with Pluto.jl._

Julia Tutorials Template