using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
using Statistics, Dates
iowa_file_path = "data/home-data-for-ml-course/train.csv"
"data/home-data-for-ml-course/train.csv"
home_data = CSV.read(iowa_file_path, DataFrame);
describe(home_data)
81×7 DataFrame
 Row │ variable     mean       min        median    max       nmissing  eltype
─────┼──────────────────────────────────────────────────────────────────────────
   1 │ Id           730.5      1          730.5     1460             0  Int64
   2 │ MSSubClass   56.8973    20         50.0      190              0  Int64
   3 │ MSZoning                "C (all)"            "RM"             0  String7
   4 │ LotFrontage             "100"                "NA"             0  String3
   5 │ LotArea      10516.8    1300       9478.5    215245           0  Int64
   6 │ Street                  "Grvl"               "Pave"           0  String7
   7 │ Alley                   "Grvl"               "Pave"           0  String7
   8 │ LotShape                "IR1"                "Reg"            0  String3
   9 │ LandContour             "Bnk"                "Lvl"            0  String3
  10 │ Utilities               "AllPub"             "NoSeWa"         0  String7
  ⋮  │     ⋮            ⋮          ⋮         ⋮         ⋮         ⋮        ⋮
  81 │ SalePrice    1.80921e5  34900      163000.0  755000           0  Int64
                                                            70 rows omitted
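Note that columns such as `LotFrontage` and `Alley` store the literal string `"NA"` rather than `missing`, so `nmissing` is 0 even where data is absent. To see only the per-column missing counts, and (as an optional sketch) to re-read the file so `"NA"` parses as `missing`:

```julia
describe(home_data, :nmissing)   # just the missing-value counts per column

# Optional: treat "NA" as missing when reading (this changes column eltypes):
# home_data = CSV.read(iowa_file_path, DataFrame; missingstring = "NA")
```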
names(home_data)
81-element Vector{String}:
 "Id"
 "MSSubClass"
 "MSZoning"
 "LotFrontage"
 "LotArea"
 "Street"
 "Alley"
 ⋮
 "MiscVal"
 "MoSold"
 "YrSold"
 "SaleType"
 "SaleCondition"
 "SalePrice"
y = home_data.SalePrice;
feature_names = [:LotArea, :YearBuilt, Symbol("1stFlrSF"), Symbol("2ndFlrSF"), :FullBath, :BedroomAbvGr, :TotRmsAbvGrd];
X = home_data[:, feature_names];
describe(X)
7×7 DataFrame
 Row │ variable      mean     min   median  max     nmissing  eltype
─────┼───────────────────────────────────────────────────────────────
   1 │ LotArea       10516.8  1300  9478.5  215245         0  Int64
   2 │ YearBuilt     1971.27  1872  1973.0  2010           0  Int64
   3 │ 1stFlrSF      1162.63  334   1087.0  4692           0  Int64
   4 │ 2ndFlrSF      346.992  0     0.0     2065           0  Int64
   5 │ FullBath      1.56507  0     2.0     3              0  Int64
   6 │ BedroomAbvGr  2.86644  0     3.0     8              0  Int64
   7 │ TotRmsAbvGrd  6.51781  2     6.0     14             0  Int64
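All seven features are stored as `Int64`, which MLJ's scientific types interpret as `Count`, while `DecisionTreeRegressor` declares `Continuous` inputs; that mismatch is why the machines below pass `scitype_check_level=0`. An alternative sketch that satisfies the check instead of silencing it:

```julia
schema(X)                             # shows the scitype MLJ assigns each column
Xc = coerce(X, Count => Continuous)   # reinterpret integer features as Continuous
```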
Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
DecisionTreeRegressor
iowa_model = Tree(rng = 123)
DecisionTreeRegressor(
  max_depth = -1, 
  min_samples_leaf = 5, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = 0, 
  post_prune = false, 
  merge_purity_threshold = 1.0, 
  feature_importance = :impurity, 
  rng = 123)
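By default the tree grows without restriction (`max_depth = -1` means no depth limit). Any hyperparameter can be overridden at construction; for instance, a depth-limited variant with illustrative values:

```julia
small_tree = Tree(max_depth = 5, min_samples_leaf = 5, rng = 123)
```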
mach = machine(iowa_model, X, y, scitype_check_level=0)
untrained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @890 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @450 ⏎ AbstractVector{ScientificTypesBase.Count}
fit!(mach)
trained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @890 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @450 ⏎ AbstractVector{ScientificTypesBase.Count}
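After fitting, the learned tree can be inspected, and because the model was built with `feature_importance = :impurity`, per-feature importances are available too (an optional check, not part of the original run):

```julia
fitted_params(mach)        # named tuple holding the raw DecisionTree.jl tree
feature_importances(mach)  # impurity-based importance of each feature
```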
predictions = predict(mach, X)
1460-element Vector{Float64}:
 212000.0
 163600.0
 207066.66666666666
 127600.0
 264816.6666666667
 140375.0
 264580.0
      ⋮
 182591.42857142858
 175700.0
 225066.66666666666
 256480.0
 133045.0
 154485.7142857143
DataFrame("predictions"=> round.(Int,predictions), "y" => y)
1460×2 DataFrame
  Row │ predictions  y
──────┼─────────────────────
    1 │      212000  208500
    2 │      163600  181500
    3 │      207067  223500
    4 │      127600  140000
    5 │      264817  250000
    6 │      140375  143000
    7 │      264580  307000
    8 │      198567  200000
    9 │      112329  129900
   10 │      129650  118000
  ⋮   │      ⋮          ⋮
 1460 │      154486  147500
            1449 rows omitted
begin
    println("First in-sample predictions:", predictions[1:5])
    println("Actual target values for those homes:", y[1:5])
end
mean_absolute_error(predictions, y)
16686.449680908874
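Since MAE is simply the mean absolute residual, the same value can be reproduced by hand with the already-loaded Statistics:

```julia
mean(abs.(predictions .- y))   # ≈ 16686.45, matching mean_absolute_error
```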
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true)
((1168×7 DataFrame
  Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
      │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
──────┼──────────────────────────────────────────────────────────────────────────────
    1 │   15865       1970      2217         0         2             4             8
    2 │    9920       1969       971         0         1             3             5
    3 │    8963       1976      1175      1540         3             4            11
    4 │    7094       1966       894         0         1             3             5
    5 │   10530       1971       981         0         1             3             5
  ⋮   │    ⋮         ⋮         ⋮         ⋮         ⋮           ⋮             ⋮
 1165 │    8125       2006       778       798         2             3             6
 1166 │    8250       1964      1092         0         1             3             6
 1167 │    7082       1916       948       980         2             5            10
 1168 │   11160       1968      2110         0         2             3             8
                                                                    1159 rows omitted, 292×7 DataFrame
 Row │ LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  TotRmsAbvGrd
     │ Int64    Int64      Int64     Int64     Int64     Int64         Int64
─────┼──────────────────────────────────────────────────────────────────────────────
   1 │    9947       1990      1217         0         2             3             6
   2 │    8712       1957      1306         0         1             2             5
   3 │   14191       2002       993       915         2             4             9
   4 │   39104       1954      1363         0         1             2             5
   5 │   10678       1992      2129       743         2             4             9
  ⋮  │    ⋮         ⋮         ⋮         ⋮         ⋮           ⋮             ⋮
 289 │   17503       1948       912       546         1             3             6
 290 │    4270       1931       774         0         1             3             6
 291 │   11228       1993      1080      1017         2             3             9
 292 │   19296       1962      1382         0         1             3             6
                                                                    283 rows omitted), ([268000, 128500, 299800, 125000, 143250, 134900, 110000, 170000, 176500, 154000  …  240000, 135000, 140000, 222000, 235000, 302000, 197000, 145000, 160000, 244000], [173000, 153000, 202900, 241500, 285000, 354000, 237000, 152000, 108000, 140000  …  180000, 223000, 130500, 175000, 156500, 140000, 97500, 79000, 228000, 176000]))
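An equivalent idiom, common in the MLJ documentation, partitions row indices rather than the tables themselves (shown here as an alternative sketch; the `2`-suffixed names are hypothetical):

```julia
train, test = partition(eachindex(y), 0.8, rng = 123)   # index vectors
Xtrain2, ytrain2 = X[train, :], y[train]
Xtest2,  ytest2  = X[test, :],  y[test]
```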
iowa_model1 = Tree(rng = 123)
DecisionTreeRegressor(
  max_depth = -1, 
  min_samples_leaf = 5, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = 0, 
  post_prune = false, 
  merge_purity_threshold = 1.0, 
  feature_importance = :impurity, 
  rng = 123)
mach1 = machine(iowa_model1, Xtrain, ytrain, scitype_check_level=0)
untrained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @162 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @231 ⏎ AbstractVector{ScientificTypesBase.Count}
fit!(mach1)
trained Machine; caches model-specific representations of data
  model: DecisionTreeRegressor(max_depth = -1, …)
  args: 
    1:	Source @162 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}}
    2:	Source @231 ⏎ AbstractVector{ScientificTypesBase.Count}
val_predictions = predict(mach1, Xtest)
292-element Vector{Float64}:
 182000.0
 136500.0
 244698.88888888888
 147570.22222222222
 369939.6
 302241.4285714286
 244698.88888888888
      ⋮
 157044.44444444444
 136935.0
 143812.5
  64757.142857142855
 244500.0
 147570.22222222222
DataFrame("val_predictions"=> round.(Int,val_predictions[1:5]), "val_y" => ytest[1:5])
5×2 DataFrame
 Row │ val_predictions  val_y
─────┼────────────────────────
   1 │          182000  173000
   2 │          136500  153000
   3 │          244699  202900
   4 │          147570  241500
   5 │          369940  285000
mean_absolute_error(val_predictions, ytest)
26826.471499238956
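The validation MAE (≈26,826) is far worse than the in-sample MAE (≈16,686): the unrestricted tree has memorized the training data. A small depth sweep (a sketch with illustrative values, not part of the original notebook) makes the underfitting/overfitting trade-off visible:

```julia
for depth in (2, 5, 10, -1)   # -1 grows an unrestricted tree
    m = machine(Tree(max_depth = depth, rng = 123), Xtrain, ytrain,
                scitype_check_level = 0)
    fit!(m, verbosity = 0)
    mae = mean_absolute_error(predict(m, Xtest), ytest)
    println("max_depth = $depth  =>  validation MAE = $(round(mae; digits = 1))")
end
```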

Built with Julia 1.9.1 and

CSV 0.10.9
DataFrames 1.5.0
MLJ 0.19.1
MLJDecisionTreeInterface 0.4.0
Statistics 1.9.0

To run this tutorial locally, download [this file](/tutorials/exploringdata01x01.jl) and open it with Pluto.jl.