using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
# Load the Kaggle "Home Data for ML Course" train/test sets.
# joinpath builds OS-portable paths instead of hard-coding "//" separators.
X_full = CSV.read(joinpath("data", "home-data-for-ml-course", "train.csv"), DataFrame);
X_test_full = CSV.read(joinpath("data", "home-data-for-ml-course", "test.csv"), DataFrame);
describe(X_full);
# Drop rows whose target (SalePrice) is missing, in place.
dropmissing!(X_full, :SalePrice);
# Target vector for training.
y = X_full.SalePrice;

Fixing the data types

Convert the String-typed columns to Int, replacing unparseable entries (e.g. "NA") with `missing`.

# These columns were read as strings (because of "NA" sentinels); parse each
# to Int, mapping unparseable entries to `missing`. A loop replaces the three
# copy-pasted conversion statements.
begin
    for col in (:LotFrontage, :MasVnrArea, :GarageYrBlt)
        X_full[!, col] = something.(tryparse.(Int, X_full[!, col]), missing)
    end
    select!(X_full, Not(:SalePrice)) # drop the target column in place
end;
size(X_full)
(1460, 80)
# Keep only numeric (possibly missing) columns as model features.
# `!` indexing reads the column without copying it, unlike `[:, x]`.
X = select(X_full, Cols(c -> eltype(X_full[!, c]) <: Union{Number,Missing}));
X_full # preview the full training table
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShape...
1160"RL"658450"Pave""NA""Reg"
2220"RL"809600"Pave""NA""Reg"
3360"RL"6811250"Pave""NA""IR1"
4470"RL"609550"Pave""NA""IR1"
5560"RL"8414260"Pave""NA""IR1"
6650"RL"8514115"Pave""NA""IR1"
7720"RL"7510084"Pave""NA""Reg"
8860"RL"missing10382"Pave""NA""IR1"
9950"RM"516120"Pave""NA""Reg"
1010190"RL"507420"Pave""NA""Reg"
...
1460146020"RL"759937"Pave""NA""Reg"
size(X) # (rows, numeric feature columns)
(1460, 37)
begin
    # Test-set columns that contain "NA" sentinels were read as strings;
    # parse each to Int, mapping unparseable entries to `missing`.
    # A loop replaces eleven copy-pasted conversion statements.
    for col in (:LotFrontage, :MasVnrArea, :GarageYrBlt, :BsmtFinSF1,
                :BsmtFinSF2, :BsmtUnfSF, :TotalBsmtSF, :GarageCars,
                :GarageArea, :BsmtFullBath, :BsmtHalfBath)
        X_test_full[!, col] = something.(tryparse.(Int, X_test_full[!, col]), missing)
    end
end
1459-element Vector{Union{Missing, Int64}}:
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 1
 0
X_test_full # preview the raw test table
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShape...
1146120"RH"8011622"Pave""NA""Reg"
2146220"RL"8114267"Pave""NA""IR1"
3146360"RL"7413830"Pave""NA""IR1"
4146460"RL"789978"Pave""NA""IR1"
51465120"RL"435005"Pave""NA""IR1"
6146660"RL"7510000"Pave""NA""IR1"
7146720"RL"missing7980"Pave""NA""IR1"
8146860"RL"638402"Pave""NA""IR1"
9146920"RL"8510176"Pave""NA""Reg"
10147020"RL"708400"Pave""NA""Reg"
...
1459291960"RL"749627"Pave""NA""Reg"
size(X_test_full) # (rows, all columns) of the raw test table
(1459, 80)
# Same numeric-column filter as for the training features.
# `!` indexing reads the column without copying it, unlike `[:, x]`.
X_test = select(X_test_full, Cols(c -> eltype(X_test_full[!, c]) <: Union{Number, Missing}));
size(X_test) # (rows, numeric feature columns) — matches X's column count
(1459, 37)
# 80/20 train/validation split of features and target together (multi=true);
# rng=0 fixes the seed so the split is reproducible.
(X_train, X_valid), (y_train, y_valid) = partition((X, y), 0.8, multi=true, rng=0);

Preliminary Investigation

(1168, 37)
# Count missing values per column of the training split.
describe(X_train, :nmissing)
variablenmissing
1:Id0
2:MSSubClass0
3:LotFrontage212
4:LotArea0
5:OverallQual0
6:OverallCond0
7:YearBuilt0
8:YearRemodAdd0
9:MasVnrArea6
10:BsmtFinSF10
...
37:YrSold0
# Fit a 100-tree random forest on (X_train, y_train) and return the mean
# absolute error of its predictions on (X_valid, y_valid). Used to compare
# the different missing-value strategies on equal footing.
function score_dataset(X_train, X_valid, y_train, y_valid)
    RandomForest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
    rf_machine = machine(RandomForest(n_trees=100, rng=0), X_train, y_train,
                         scitype_check_level=0)
    fit!(rf_machine, verbosity=0)
    return mean_absolute_error(predict(rf_machine, X_valid), y_valid)
end
score_dataset (generic function with 1 method)
# Names of training columns that still contain `missing` entries.
cols_with_missing = names(X_train, [any(ismissing, col) for col in eachcol(X_train)])
3-element Vector{String}:
 "LotFrontage"
 "MasVnrArea"
 "GarageYrBlt"
# Baseline approach 1: drop every column that has any missing data.
reduced_X_train = X_train[:, Not(cols_with_missing)];
reduced_X_valid = X_valid[:, Not(cols_with_missing)];
begin
    # Report the MAE of the drop-columns baseline.
    println("MAE (Drop columns with missing values):")
    drop_mae = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
    println(drop_mae)
end
# Imputer with MLJ defaults: median for continuous, rounded median for counts,
# mode for finite columns (see the printed model parameters below).
my_imputer = FillImputer()
FillImputer(
  features = Symbol[], 
  continuous_fill = MLJModels._median, 
  count_fill = MLJModels._round_median, 
  finite_fill = MLJModels._mode)
# Bind the imputer to the training features and fit it (learns fill values).
mach = fit!(machine(my_imputer, X_train))
trained Machine; caches model-specific representations of data
  model: FillImputer(features = Symbol[], …)
  args: 
    1:	Source @464 ⏎ ScientificTypesBase.Table{Union{AbstractVector{Union{Missing, ScientificTypesBase.Count}}, AbstractVector{ScientificTypesBase.Count}}}
# Baseline approach 2: fill the gaps in both splits using the values the
# imputer learned from the training split, then score the imputed data.
imputed_X_train = MLJ.transform(mach, X_train);
imputed_X_valid = MLJ.transform(mach, X_valid);
begin
    println("MAE (Imputation):")
    imputed_mae = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
    println(imputed_mae)
end

Part A

begin
    # Final pipeline: fit a fresh imputer on the training split and apply it
    # to both the training and validation features.
    final_imputer = FillImputer()
    impute_mach = fit!(machine(final_imputer, X_train))
    final_X_train = MLJ.transform(impute_mach, X_train)
    final_X_valid = MLJ.transform(impute_mach, X_valid)
end;
begin
    # Train the final 100-tree forest on the imputed training data and
    # report the validation MAE.
    Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
    forest = Forest(n_trees=100, rng=0)
    final_mach = fit!(machine(forest, final_X_train, y_train, scitype_check_level=0))
    preds_valid = predict(final_mach, final_X_valid)
    println("MAE (Your approach):")
    println(mean_absolute_error(preds_valid, y_valid))
end

Part B

# Apply the imputer fitted on the training split to the test features.
final_X_test = MLJ.transform(impute_mach, X_test)
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAdd...
114612080116225619611961
214622081142676619581958
314636074138305519971998
41464607899786619981998
514651204350058519921992
614666075100006519931994
71467207079806719922007
81468606384026519981998
914692085101767519901990
101470207084004519701970
...
14592919607496277519931994
# Test predictions from the final trained forest on the imputed test features.
preds_test = predict(final_mach, final_X_test)
1459-element Vector{Float64}:
 129529.76
 150821.75
 181431.32
 183204.75
 186289.32
 189813.35
 175749.72
      ⋮
  82270.52
  81732.5
  87298.61
 167994.5
 119096.07
 242675.13
begin
    # Assemble the Kaggle submission table (Id, SalePrice) and write it out.
    # joinpath builds an OS-portable path instead of hard-coding "//" separators.
    output = DataFrame("Id" => X_test.Id,
                       "SalePrice" => preds_test)
    CSV.write(joinpath("data", "home-data-for-ml-course", "submissions_02x02.csv"), output)
end
"data//home-data-for-ml-course//submissions_02x02.csv"

Built with Julia 1.9.1 and

CSV 0.10.11
DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0

To run this tutorial locally, download [this file](/tutorials/missingvalues02x02.jl) and open it with Pluto.jl.