using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
# Load the Kaggle "Housing Prices" training and test sets into DataFrames.
X_full = CSV.read("data//home-data-for-ml-course//train.csv", DataFrame);
X_test_full = CSV.read("data//home-data-for-ml-course//test.csv", DataFrame);
# Per-column summary statistics, shown for inspection only.
describe(X_full);
# Alternative considered: subset!(X_full, :SalePrice => ByRow(!=("missing")))
# Drop rows whose target (SalePrice) is missing, in place.
dropmissing!(X_full, Cols(:SalePrice));
# The prediction target.
y = X_full.SalePrice;
Fixing the data types:
convert the `String`-typed columns to `Int`, turning entries that cannot be parsed (such as "NA") into `missing`.
# These columns were read as String because they contain "NA" (tryparse
# requires an AbstractString input). Parse each value to Int; tryparse
# yields `nothing` on failure, which `something` maps to `missing`.
# A single loop replaces three copy-pasted assignment lines.
for col in (:LotFrontage, :MasVnrArea, :GarageYrBlt)
    X_full[!, col] = something.(tryparse.(Int, X_full[!, col]), missing)
end
select!(X_full, Not(:SalePrice)); # drop column SalePrice in place (already captured in `y`)
size(X_full)
(1460, 80)
# Keep only the columns whose element type is numeric (allowing missings).
X = select(X_full, [c for c in names(X_full) if eltype(X_full[!, c]) <: Union{Number,Missing}]);
X_full
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | ... | |
---|---|---|---|---|---|---|---|---|---|
1 | 1 | 60 | "RL" | 65 | 8450 | "Pave" | "NA" | "Reg" | |
2 | 2 | 20 | "RL" | 80 | 9600 | "Pave" | "NA" | "Reg" | |
3 | 3 | 60 | "RL" | 68 | 11250 | "Pave" | "NA" | "IR1" | |
4 | 4 | 70 | "RL" | 60 | 9550 | "Pave" | "NA" | "IR1" | |
5 | 5 | 60 | "RL" | 84 | 14260 | "Pave" | "NA" | "IR1" | |
6 | 6 | 50 | "RL" | 85 | 14115 | "Pave" | "NA" | "IR1" | |
7 | 7 | 20 | "RL" | 75 | 10084 | "Pave" | "NA" | "Reg" | |
8 | 8 | 60 | "RL" | missing | 10382 | "Pave" | "NA" | "IR1" | |
9 | 9 | 50 | "RM" | 51 | 6120 | "Pave" | "NA" | "Reg" | |
10 | 10 | 190 | "RL" | 50 | 7420 | "Pave" | "NA" | "Reg" | |
... | |||||||||
1460 | 1460 | 20 | "RL" | 75 | 9937 | "Pave" | "NA" | "Reg" |
size(X)
(1460, 37)
begin
    # Same type fix as for the training set: these test-set columns were read
    # as String because they contain "NA"; parse to Int, mapping unparseable
    # entries to `missing`. One loop replaces eleven copy-pasted lines.
    for col in (:LotFrontage, :MasVnrArea, :GarageYrBlt,
                :BsmtFinSF1, :BsmtFinSF2, :BsmtUnfSF, :TotalBsmtSF,
                :GarageCars, :GarageArea, :BsmtFullBath, :BsmtHalfBath)
        X_test_full[!, col] = something.(tryparse.(Int, X_test_full[!, col]), missing)
    end
end
1459-element Vector{Union{Missing, Int64}}: 0 0 0 0 0 0 0 ⋮ 0 0 0 0 1 0
X_test_full
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | ... | |
---|---|---|---|---|---|---|---|---|---|
1 | 1461 | 20 | "RH" | 80 | 11622 | "Pave" | "NA" | "Reg" | |
2 | 1462 | 20 | "RL" | 81 | 14267 | "Pave" | "NA" | "IR1" | |
3 | 1463 | 60 | "RL" | 74 | 13830 | "Pave" | "NA" | "IR1" | |
4 | 1464 | 60 | "RL" | 78 | 9978 | "Pave" | "NA" | "IR1" | |
5 | 1465 | 120 | "RL" | 43 | 5005 | "Pave" | "NA" | "IR1" | |
6 | 1466 | 60 | "RL" | 75 | 10000 | "Pave" | "NA" | "IR1" | |
7 | 1467 | 20 | "RL" | missing | 7980 | "Pave" | "NA" | "IR1" | |
8 | 1468 | 60 | "RL" | 63 | 8402 | "Pave" | "NA" | "IR1" | |
9 | 1469 | 20 | "RL" | 85 | 10176 | "Pave" | "NA" | "Reg" | |
10 | 1470 | 20 | "RL" | 70 | 8400 | "Pave" | "NA" | "Reg" | |
... | |||||||||
1459 | 2919 | 60 | "RL" | 74 | 9627 | "Pave" | "NA" | "Reg" |
size(X_test_full)
(1459, 80)
# Restrict the test set to the numeric (possibly missing) columns, mirroring `X`.
X_test = select(X_test_full, [c for c in names(X_test_full) if eltype(X_test_full[!, c]) <: Union{Number, Missing}]);
size(X_test)
(1459, 37)
# 80/20 train/validation split of features and target together, with a fixed seed for reproducibility.
(X_train, X_valid), (y_train, y_valid) = partition((X, y), 0.8, multi=true, rng=0);
Preliminary Investigation
(1168, 37)
# Count of missing values per training column.
describe(X_train, :nmissing)
variable | nmissing | |
---|---|---|
1 | :Id | 0 |
2 | :MSSubClass | 0 |
3 | :LotFrontage | 212 |
4 | :LotArea | 0 |
5 | :OverallQual | 0 |
6 | :OverallCond | 0 |
7 | :YearBuilt | 0 |
8 | :YearRemodAdd | 0 |
9 | :MasVnrArea | 6 |
10 | :BsmtFinSF1 | 0 |
... | ||
37 | :YrSold | 0 |
"""
    score_dataset(X_train, X_valid, y_train, y_valid)

Fit a 100-tree random-forest regressor (DecisionTree.jl backend, fixed seed)
on `(X_train, y_train)` and return the mean absolute error of its
predictions on `(X_valid, y_valid)`. Used to compare missing-value strategies.
"""
function score_dataset(X_train, X_valid, y_train, y_valid)
    RandomForest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
    regressor = RandomForest(n_trees=100, rng=0)
    # scitype_check_level=0 silences scientific-type warnings for Count columns.
    fitted = machine(regressor, X_train, y_train, scitype_check_level=0)
    fit!(fitted, verbosity=0)
    return mean_absolute_error(predict(fitted, X_valid), y_valid)
end
score_dataset (generic function with 1 method)
# Names of the training columns that contain at least one missing value.
cols_with_missing = [col for col in names(X_train) if any(ismissing, X_train[!, col])]
3-element Vector{String}: "LotFrontage" "MasVnrArea" "GarageYrBlt"
# Approach 1: drop every column that has missing values (copies, originals untouched).
reduced_X_train = X_train[:, Not(cols_with_missing)];
reduced_X_valid = X_valid[:, Not(cols_with_missing)];
begin
# Baseline: forest trained after dropping the missing-value columns.
println("MAE (Drop columns with missing values):")
println(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
end
# Approach 2: imputation. FillImputer's defaults fill continuous columns with
# the median, count columns with the rounded median, and finite columns with the mode.
my_imputer = FillImputer()
FillImputer( features = Symbol[], continuous_fill = MLJModels._median, count_fill = MLJModels._round_median, finite_fill = MLJModels._mode)
# Bind the imputer to the training features and fit (learns per-column fill values).
mach = machine(my_imputer, X_train) |> fit!
trained Machine; caches model-specific representations of data model: FillImputer(features = Symbol[], …) args: 1: Source @464 ⏎ ScientificTypesBase.Table{Union{AbstractVector{Union{Missing, ScientificTypesBase.Count}}, AbstractVector{ScientificTypesBase.Count}}}
# Apply the fitted imputer to both splits; fill values were learned from X_train only.
imputed_X_train = MLJ.transform(mach, X_train);
imputed_X_valid = MLJ.transform(mach, X_valid);
begin
# Score approach 2: imputation keeps all 37 numeric columns.
println("MAE (Imputation):")
println(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))
end
Part A
begin
# Final pipeline, part A: impute with statistics learned from the
# training split only, then transform both splits with that one machine.
final_imputer = FillImputer()
impute_mach = machine(final_imputer, X_train) |> fit!
final_X_train = MLJ.transform(impute_mach, X_train)
final_X_valid = MLJ.transform(impute_mach, X_valid)
end;
begin
# Train the final 100-tree random forest on the imputed training data.
Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
forest = Forest(n_trees=100, rng=0)
final_mach = machine(forest, final_X_train, y_train, scitype_check_level=0) |> fit!
# Get validation predictions and MAE
preds_valid = predict(final_mach, final_X_valid)
println("MAE (Your approach):")
println(mean_absolute_error(preds_valid, y_valid))
end
Part B
# Part B: impute the test set with the same fitted imputer (no refitting on test data).
final_X_test = MLJ.transform(impute_mach, X_test)
Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | ... | |
---|---|---|---|---|---|---|---|---|---|
1 | 1461 | 20 | 80 | 11622 | 5 | 6 | 1961 | 1961 | |
2 | 1462 | 20 | 81 | 14267 | 6 | 6 | 1958 | 1958 | |
3 | 1463 | 60 | 74 | 13830 | 5 | 5 | 1997 | 1998 | |
4 | 1464 | 60 | 78 | 9978 | 6 | 6 | 1998 | 1998 | |
5 | 1465 | 120 | 43 | 5005 | 8 | 5 | 1992 | 1992 | |
6 | 1466 | 60 | 75 | 10000 | 6 | 5 | 1993 | 1994 | |
7 | 1467 | 20 | 70 | 7980 | 6 | 7 | 1992 | 2007 | |
8 | 1468 | 60 | 63 | 8402 | 6 | 5 | 1998 | 1998 | |
9 | 1469 | 20 | 85 | 10176 | 7 | 5 | 1990 | 1990 | |
10 | 1470 | 20 | 70 | 8400 | 4 | 5 | 1970 | 1970 | |
... | |||||||||
1459 | 2919 | 60 | 74 | 9627 | 7 | 5 | 1993 | 1994 |
# Predict sale prices for the imputed test set with the trained forest.
preds_test = predict(final_mach, final_X_test)
1459-element Vector{Float64}: 129529.76 150821.75 181431.32 183204.75 186289.32 189813.35 175749.72 ⋮ 82270.52 81732.5 87298.61 167994.5 119096.07 242675.13
begin
    # Assemble the Kaggle submission (Id, predicted SalePrice) and write it to CSV.
    submission = DataFrame("Id" => X_test.Id,
                           "SalePrice" => preds_test)
    CSV.write("data//home-data-for-ml-course//submissions_02x02.csv", submission)
end
"data//home-data-for-ml-course//submissions_02x02.csv"
Built with Julia 1.9.1 and
CSV 0.10.11, DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0
To run this tutorial locally, download [this file](/tutorials/missingvalues02x02.jl) and open it with Pluto.jl.