using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
using Statistics, Dates
iowa_file_path = "data/home-data-for-ml-course/train.csv"
"data/home-data-for-ml-course/train.csv"
home_data = CSV.read(iowa_file_path, DataFrame);
describe(home_data)
81×7 DataFrame
 Row │ variable     mean       min        median    max       nmissing  eltype
─────┼──────────────────────────────────────────────────────────────────────────
   1 │ Id           730.5      1          730.5     1460             0  Int64
   2 │ MSSubClass   56.8973    20         50.0      190              0  Int64
   3 │ MSZoning                "C (all)"            "RM"             0  String7
   4 │ LotFrontage             "100"                "NA"             0  String3
   5 │ LotArea      10516.8    1300       9478.5    215245           0  Int64
   6 │ Street                  "Grvl"               "Pave"           0  String7
   7 │ Alley                   "Grvl"               "Pave"           0  String7
   8 │ LotShape                "IR1"                "Reg"            0  String3
   9 │ LandContour             "Bnk"                "Lvl"            0  String3
  10 │ Utilities               "AllPub"             "NoSeWa"         0  String7
  ⋮  │     ⋮            ⋮          ⋮         ⋮         ⋮         ⋮        ⋮
  81 │ SalePrice    1.80921e5  34900      163000.0  755000           0  Int64
                                                            70 rows omitted
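Note that columns such as `LotFrontage` and `Alley` store the literal string `"NA"` rather than `missing`, so `nmissing` is 0 even where data is absent. To see only the per-column missing counts, and (as an optional sketch) to re-read the file so `"NA"` parses as `missing`:

```julia
describe(home_data, :nmissing)   # just the missing-value counts per column

# Optional: treat "NA" as missing when reading (this changes column eltypes):
# home_data = CSV.read(iowa_file_path, DataFrame; missingstring = "NA")
```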
names(home_data)
81-element Vector{String}:
 "Id"
 "MSSubClass"
 "MSZoning"
 "LotFrontage"
 "LotArea"
 "Street"
 "Alley"
 ⋮
 "MiscVal"
 "MoSold"
 "YrSold"
 "SaleType"
 "SaleCondition"
 "SalePrice"
y = home_data.SalePrice;
feature_names = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"), :FullBath, :BedroomAbvGr, :TotRmsAbvGrd];
X = home_data[:, feature_names];
describe(X)
7×7 DataFrame
 Row │ variable      mean     min   median  max     nmissing  eltype
─────┼───────────────────────────────────────────────────────────────
   1 │ LotArea       10516.8  1300  9478.5  215245         0  Int64
   2 │ YearBuilt     1971.27  1872  1973.0  2010           0  Int64
   3 │ 1stFlrSF      1162.63  334   1087.0  4692           0  Int64
   4 │ 2ndFlrSF      346.992  0     0.0     2065           0  Int64
   5 │ FullBath      1.56507  0     2.0     3              0  Int64
   6 │ BedroomAbvGr  2.86644  0     3.0     8              0  Int64
   7 │ TotRmsAbvGrd  6.51781  2     6.0     14             0  Int64
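All seven features are stored as `Int64`, which MLJ's scientific types interpret as `Count`, while `DecisionTreeRegressor` declares `Continuous` inputs; that mismatch is why the machines below pass `scitype_check_level=0`. An alternative sketch that satisfies the check instead of silencing it:

```julia
schema(X)                             # shows the scitype MLJ assigns each column
Xc = coerce(X, Count => Continuous)   # reinterpret integer features as Continuous
```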
Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
DecisionTreeRegressor
iowa_model = Tree(rng = 123)
DecisionTreeRegressor(
  max_depth = -1, 
  min_samples_leaf = 5, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = 0, 
  post_prune = false, 
  merge_purity_threshold = 1.0, 
  feature_importance = :impurity, 
  rng = 123)
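By default the tree grows without restriction (`max_depth = -1` means no depth limit). Any hyperparameter can be overridden at construction; for instance, a depth-limited variant with illustrative values:

```julia
small_tree = Tree(max_depth = 5, min_samples_leaf = 5, rng = 123)
```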
mach = machine(iowa_model, X, y, scitype_check_level=0)
untrained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @890 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @450 ⏎ AbstractVector{ScientificTypesBase.Count}
fit!(mach)
trained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @890 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @450 ⏎ AbstractVector{ScientificTypesBase.Count}
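After fitting, the learned tree can be inspected, and because the model was built with `feature_importance = :impurity`, per-feature importances are available too (an optional check, not part of the original run):

```julia
fitted_params(mach)        # named tuple holding the raw DecisionTree.jl tree
feature_importances(mach)  # impurity-based importance of each feature
```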
predictions = predict(mach, X)
1460-element Vector{Float64}:
 212000.0
 163600.0
 207066.66666666666
 127600.0
 264816.6666666667
 140375.0
 264580.0
      ⋮
 182591.42857142858
 175700.0
 225066.66666666666
 256480.0
 133045.0
 154485.7142857143
DataFrame("predictions"=> round.(Int,predictions), "y" => y)
1460×2 DataFrame
  Row │ predictions  y
──────┼─────────────────────
    1 │      212000  208500
    2 │      163600  181500
    3 │      207067  223500
    4 │      127600  140000
    5 │      264817  250000
    6 │      140375  143000
    7 │      264580  307000
    8 │      198567  200000
    9 │      112329  129900
   10 │      129650  118000
  ⋮   │      ⋮          ⋮
 1460 │      154486  147500
            1449 rows omitted
begin
    println("First in-sample predictions:", predictions[1:5])
    println("Actual target values for those homes:", y[1:5])
end
mean_absolute_error(predictions, y)
16686.449680908874
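Since MAE is simply the mean absolute residual, the same value can be reproduced by hand with the already-loaded Statistics:

```julia
mean(abs.(predictions .- y))   # ≈ 16686.45, matching mean_absolute_error
```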
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true)
((1168×7 DataFrame
  Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
      │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
──────┼──────────────────────────────────────────────────────────────────────────────
    1 │   15865       1970      2217         0         2             4             8
    2 │    9920       1969       971         0         1             3             5
    3 │    8963       1976      1175      1540         3             4            11
    4 │    7094       1966       894         0         1             3             5
    5 │   10530       1971       981         0         1             3             5
  ⋮   │    ⋮         ⋮         ⋮         ⋮         ⋮           ⋮             ⋮
 1165 │    8125       2006       778       798         2             3             6
 1166 │    8250       1964      1092         0         1             3             6
 1167 │    7082       1916       948       980         2             5            10
 1168 │   11160       1968      2110         0         2             3             8
                                                                    1159 rows omitted, 292×7 DataFrame
 Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
     │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
─────┼──────────────────────────────────────────────────────────────────────────────
   1 │    9947       1990      1217         0         2             3             6
   2 │    8712       1957      1306         0         1             2             5
   3 │   14191       2002       993       915         2             4             9
   4 │   39104       1954      1363         0         1             2             5
   5 │   10678       1992      2129       743         2             4             9
  ⋮  │    ⋮         ⋮         ⋮         ⋮         ⋮           ⋮             ⋮
 289 │   17503       1948       912       546         1             3             6
 290 │    4270       1931       774         0         1             3             6
 291 │   11228       1993      1080      1017         2             3             9
 292 │   19296       1962      1382         0         1             3             6
                                                                    283 rows omitted), ([268000, 128500, 299800, 125000, 143250, 134900, 110000, 170000, 176500, 154000  …  240000, 135000, 140000, 222000, 235000, 302000, 197000, 145000, 160000, 244000], [173000, 153000, 202900, 241500, 285000, 354000, 237000, 152000, 108000, 140000  …  180000, 223000, 130500, 175000, 156500, 140000, 97500, 79000, 228000, 176000]))
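An equivalent idiom, common in the MLJ documentation, partitions row indices rather than the tables themselves (shown here as an alternative sketch; the `2`-suffixed names are hypothetical):

```julia
train, test = partition(eachindex(y), 0.8, rng = 123)   # index vectors
Xtrain2, ytrain2 = X[train, :], y[train]
Xtest2,  ytest2  = X[test, :],  y[test]
```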
iowa_model1 = Tree(rng = 123)
DecisionTreeRegressor(
  max_depth = -1, 
  min_samples_leaf = 5, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = 0, 
  post_prune = false, 
  merge_purity_threshold = 1.0, 
  feature_importance = :impurity, 
  rng = 123)
mach1 = machine(iowa_model1, Xtrain, ytrain, scitype_check_level=0)
untrained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @162 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @231 ⏎ AbstractVector{ScientificTypesBase.Count}
fit!(mach1)
trained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @162 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @231 ⏎ AbstractVector{ScientificTypesBase.Count}
val_predictions = predict(mach1, Xtest)
292-element Vector{Float64}:
 182000.0
 136500.0
 244698.88888888888
 147570.22222222222
 369939.6
 302241.4285714286
 244698.88888888888
      ⋮
 157044.44444444444
 136935.0
 143812.5
  64757.142857142855
 244500.0
 147570.22222222222
DataFrame("val_predictions"=> round.(Int,val_predictions[1:5]), "val_y" => ytest[1:5])
5×2 DataFrame
 Row │ val_predictions  val_y
─────┼────────────────────────
   1 │          182000  173000
   2 │          136500  153000
   3 │          244699  202900
   4 │          147570  241500
   5 │          369940  285000
mean_absolute_error(val_predictions, ytest)
26826.471499238956
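The validation MAE (≈26,826) is far worse than the in-sample MAE (≈16,686): the unrestricted tree has memorized the training data. A small depth sweep (a sketch with illustrative values, not part of the original notebook) makes the underfitting/overfitting trade-off visible:

```julia
for depth in (2, 5, 10, -1)   # -1 grows an unrestricted tree
    m = machine(Tree(max_depth = depth, rng = 123), Xtrain, ytrain,
                scitype_check_level = 0)
    fit!(m, verbosity = 0)
    mae = mean_absolute_error(predict(m, Xtest), ytest)
    println("max_depth = $depth  =>  validation MAE = $(round(mae; digits = 1))")
end
```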

Built with Julia 1.9.1 and

CSV 0.10.9
DataFrames 1.5.0
MLJ 0.19.1
MLJDecisionTreeInterface 0.4.0
Statistics 1.9.0

To run this tutorial locally, download [this file](/tutorials/exploringdata01x01.jl) and open it with Pluto.jl.