using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
using Statistics, Dates
# Path to the Ames (Iowa) housing training data. joinpath builds the path with
# the OS-appropriate separator; the original hard-coded "//" separators happen
# to collapse to "/" on POSIX but are non-portable.
iowa_file_path = joinpath("data", "home-data-for-ml-course", "train.csv")
"data//home-data-for-ml-course//train.csv"
# Load the training CSV into a DataFrame (semicolon suppresses cell output).
home_data = CSV.read(iowa_file_path, DataFrame);
# Per-column summary statistics (mean, min, median, max, missing count, eltype).
describe(home_data)
variable | mean | min | median | max | nmissing | eltype | |
---|---|---|---|---|---|---|---|
1 | :Id | 730.5 | 1 | 730.5 | 1460 | 0 | Int64 |
2 | :MSSubClass | 56.8973 | 20 | 50.0 | 190 | 0 | Int64 |
3 | :MSZoning | nothing | "C (all)" | nothing | "RM" | 0 | String7 |
4 | :LotFrontage | nothing | "100" | nothing | "NA" | 0 | String3 |
5 | :LotArea | 10516.8 | 1300 | 9478.5 | 215245 | 0 | Int64 |
6 | :Street | nothing | "Grvl" | nothing | "Pave" | 0 | String7 |
7 | :Alley | nothing | "Grvl" | nothing | "Pave" | 0 | String7 |
8 | :LotShape | nothing | "IR1" | nothing | "Reg" | 0 | String3 |
9 | :LandContour | nothing | "Bnk" | nothing | "Lvl" | 0 | String3 |
10 | :Utilities | nothing | "AllPub" | nothing | "NoSeWa" | 0 | String7 |
... | |||||||
81 | :SalePrice | 1.80921e5 | 34900 | 163000.0 | 755000 | 0 | Int64 |
10517.0
13
# List the column names — 81 variables in total, SalePrice being the target.
names(home_data)
81-element Vector{String}: "Id" "MSSubClass" "MSZoning" "LotFrontage" "LotArea" "Street" "Alley" ⋮ "MiscVal" "MoSold" "YrSold" "SaleType" "SaleCondition" "SalePrice"
# Prediction target: the sale price of each home.
y = home_data.SalePrice;

# Numeric predictor columns. Names that begin with a digit ("1stFlrSF",
# "2ndFlrSF") are not valid literal symbols, so build all of them from strings.
feature_names = Symbol.(["LotArea", "YearBuilt", "1stFlrSF", "2ndFlrSF", "FullBath", "BedroomAbvGr", "TotRmsAbvGrd"]);

# Feature matrix: a copy of the selected columns.
X = home_data[:, feature_names];

# Sanity-check the selected features with summary statistics.
describe(X)
variable | mean | min | median | max | nmissing | eltype | |
---|---|---|---|---|---|---|---|
1 | :LotArea | 10516.8 | 1300 | 9478.5 | 215245 | 0 | Int64 |
2 | :YearBuilt | 1971.27 | 1872 | 1973.0 | 2010 | 0 | Int64 |
3 | Symbol("1stFlrSF") | 1162.63 | 334 | 1087.0 | 4692 | 0 | Int64 |
4 | Symbol("2ndFlrSF") | 346.992 | 0 | 0.0 | 2065 | 0 | Int64 |
5 | :FullBath | 1.56507 | 0 | 2.0 | 3 | 0 | Int64 |
6 | :BedroomAbvGr | 2.86644 | 0 | 3.0 | 8 | 0 | Int64 |
7 | :TotRmsAbvGrd | 6.51781 | 2 | 6.0 | 14 | 0 | Int64 |
# Load the DecisionTreeRegressor model type from DecisionTree.jl through MLJ's
# model registry; verbosity=0 silences the loading message.
Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
DecisionTreeRegressor
# Instantiate the model with default hyperparameters; rng = 123 seeds the
# tree's random number generator so results are reproducible.
iowa_model = Tree(rng = 123)
DecisionTreeRegressor( max_depth = -1, min_samples_leaf = 5, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = 0, post_prune = false, merge_purity_threshold = 1.0, feature_importance = :impurity, rng = 123)
# Bind the model to the data. scitype_check_level=0 suppresses MLJ's scitype
# warning: the Int64 columns carry the Count scitype while the regressor
# expects Continuous input (coercing to Float64 would be the stricter fix).
mach = machine(iowa_model, X, y, scitype_check_level=0)
untrained Machine; caches model-specific representations of data model: DecisionTreeRegressor(max_depth = -1, …) args: 1: Source @890 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}} 2: Source @450 ⏎ AbstractVector{ScientificTypesBase.Count}
# Train the decision tree on the full dataset (all 1460 rows).
fit!(mach)
trained Machine; caches model-specific representations of data model: DecisionTreeRegressor(max_depth = -1, …) args: 1: Source @890 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}} 2: Source @450 ⏎ AbstractVector{ScientificTypesBase.Count}
# In-sample predictions — made on the same rows the model was trained on.
predictions = predict(mach, X)
1460-element Vector{Float64}: 212000.0 163600.0 207066.66666666666 127600.0 264816.6666666667 140375.0 264580.0 ⋮ 182591.42857142858 175700.0 225066.66666666666 256480.0 133045.0 154485.7142857143
# Side-by-side table of rounded in-sample predictions and the observed prices.
DataFrame(predictions = round.(Int, predictions), y = y)
predictions | y | |
---|---|---|
1 | 212000 | 208500 |
2 | 163600 | 181500 |
3 | 207067 | 223500 |
4 | 127600 | 140000 |
5 | 264817 | 250000 |
6 | 140375 | 143000 |
7 | 264580 | 307000 |
8 | 198567 | 200000 |
9 | 112329 | 129900 |
10 | 129650 | 118000 |
... | ||
1460 | 154486 | 147500 |
# Preview the first five in-sample predictions against their actual targets.
begin
println("First in-sample predictions:", predictions[1:5])
println("Actual target values for those homes:", y[1:5])
end
# In-sample MAE. This is an optimistic estimate because the model is scored
# on its own training data (compare with the validation MAE further below).
mean_absolute_error(predictions,y)
16686.449680908874
# 80/20 train/validation split. multi=true partitions X and y together with
# the same shuffled row order; rng=123 makes the split reproducible.
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true)
((1168×7 DataFrame Row │ LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath BedroomAbvGr TotRmsAbvGrd │ Int64 Int64 Int64 Int64 Int64 Int64 Int64 ──────┼────────────────────────────────────────────────────────────────────────────── 1 │ 15865 1970 2217 0 2 4 8 2 │ 9920 1969 971 0 1 3 5 3 │ 8963 1976 1175 1540 3 4 11 4 │ 7094 1966 894 0 1 3 5 5 │ 10530 1971 981 0 1 3 5 ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ 1165 │ 8125 2006 778 798 2 3 6 1166 │ 8250 1964 1092 0 1 3 6 1167 │ 7082 1916 948 980 2 5 10 1168 │ 11160 1968 2110 0 2 3 8 1159 rows omitted, 292×7 DataFrame Row │ LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath BedroomAbvGr TotRmsAbvGrd │ Int64 Int64 Int64 Int64 Int64 Int64 Int64 ─────┼────────────────────────────────────────────────────────────────────────────── 1 │ 9947 1990 1217 0 2 3 6 2 │ 8712 1957 1306 0 1 2 5 3 │ 14191 2002 993 915 2 4 9 4 │ 39104 1954 1363 0 1 2 5 5 │ 10678 1992 2129 743 2 4 9 ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ 289 │ 17503 1948 912 546 1 3 6 290 │ 4270 1931 774 0 1 3 6 291 │ 11228 1993 1080 1017 2 3 9 292 │ 19296 1962 1382 0 1 3 6 283 rows omitted), ([268000, 128500, 299800, 125000, 143250, 134900, 110000, 170000, 176500, 154000 … 240000, 135000, 140000, 222000, 235000, 302000, 197000, 145000, 160000, 244000], [173000, 153000, 202900, 241500, 285000, 354000, 237000, 152000, 108000, 140000 … 180000, 223000, 130500, 175000, 156500, 140000, 97500, 79000, 228000, 176000]))
# Fresh model instance with the same seeded hyperparameters as before.
iowa_model1 = Tree(rng = 123)
DecisionTreeRegressor( max_depth = -1, min_samples_leaf = 5, min_samples_split = 2, min_purity_increase = 0.0, n_subfeatures = 0, post_prune = false, merge_purity_threshold = 1.0, feature_importance = :impurity, rng = 123)
# Bind the new model to the training split only; scitype_check_level=0 again
# silences the Count-vs-Continuous scitype warning for the Int64 columns.
mach1 = machine(iowa_model1, Xtrain, ytrain, scitype_check_level=0)
untrained Machine; caches model-specific representations of data model: DecisionTreeRegressor(max_depth = -1, …) args: 1: Source @162 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}} 2: Source @231 ⏎ AbstractVector{ScientificTypesBase.Count}
# Train on the 1168-row training split, holding out the validation rows.
fit!(mach1)
trained Machine; caches model-specific representations of data model: DecisionTreeRegressor(max_depth = -1, …) args: 1: Source @162 ⏎ ScientificTypesBase.Table{AbstractVector{ScientificTypesBase.Count}} 2: Source @231 ⏎ AbstractVector{ScientificTypesBase.Count}
# Predict on the 292 held-out validation rows the model has never seen.
val_predictions = predict(mach1, Xtest)
292-element Vector{Float64}: 182000.0 136500.0 244698.88888888888 147570.22222222222 369939.6 302241.4285714286 244698.88888888888 ⋮ 157044.44444444444 136935.0 143812.5 64757.142857142855 244500.0 147570.22222222222
# First five validation predictions alongside the corresponding held-out targets.
DataFrame(val_predictions = round.(Int, val_predictions[1:5]), val_y = ytest[1:5])
val_predictions | val_y | |
---|---|---|
1 | 182000 | 173000 |
2 | 136500 | 153000 |
3 | 244699 | 202900 |
4 | 147570 | 241500 |
5 | 369940 | 285000 |
# Validation MAE — a more honest estimate of generalization error; notably
# larger than the in-sample MAE computed above.
mean_absolute_error(val_predictions, ytest)
26826.471499238956
Built with Julia 1.9.1 and
CSV 0.10.9
DataFrames 1.5.0
MLJ 0.19.1
MLJDecisionTreeInterface 0.4.0
Statistics 1.9.0
To run this tutorial locally, download [this file](/tutorials/exploringdata01x01.jl) and open it with Pluto.jl.