using DataFrames, CSV, MLJ, MLJDecisionTreeInterface
# Load the Kaggle "Home Data for ML Course" train/test sets.
# joinpath builds OS-portable paths instead of hard-coding "//" separators.
X_full = CSV.read(joinpath("data", "home-data-for-ml-course", "train.csv"), DataFrame);
X_test_full = CSV.read(joinpath("data", "home-data-for-ml-course", "test.csv"), DataFrame);
describe(X_full);
# Drop rows whose target (SalePrice) is missing, in place.
dropmissing!(X_full, :SalePrice);
# Target vector for training.
y = X_full.SalePrice;

Fixing the data types

Convert the String-typed columns to Int, replacing unparseable entries (e.g. "NA") with `missing`.

# These columns were read as strings (because of "NA" sentinels); parse each
# to Int, mapping unparseable entries to `missing`. A loop replaces the three
# copy-pasted conversion statements.
begin
    for col in (:LotFrontage, :MasVnrArea, :GarageYrBlt)
        X_full[!, col] = something.(tryparse.(Int, X_full[!, col]), missing)
    end
    select!(X_full, Not(:SalePrice)) # drop the target column in place
end;
size(X_full)
(1460, 80)
# Keep only numeric (possibly missing) columns as model features.
# `!` indexing reads the column without copying it, unlike `[:, x]`.
X = select(X_full, Cols(c -> eltype(X_full[!, c]) <: Union{Number,Missing}));
X_full # preview the full training table
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShape...
1160"RL"658450"Pave""NA""Reg"
2220"RL"809600"Pave""NA""Reg"
3360"RL"6811250"Pave""NA""IR1"
4470"RL"609550"Pave""NA""IR1"
5560"RL"8414260"Pave""NA""IR1"
6650"RL"8514115"Pave""NA""IR1"
7720"RL"7510084"Pave""NA""Reg"
8860"RL"missing10382"Pave""NA""IR1"
9950"RM"516120"Pave""NA""Reg"
1010190"RL"507420"Pave""NA""Reg"
...
1460146020"RL"759937"Pave""NA""Reg"
size(X) # (rows, numeric feature columns)
(1460, 37)
begin
    # Test-set columns that contain "NA" sentinels were read as strings;
    # parse each to Int, mapping unparseable entries to `missing`.
    # A loop replaces eleven copy-pasted conversion statements.
    for col in (:LotFrontage, :MasVnrArea, :GarageYrBlt, :BsmtFinSF1,
                :BsmtFinSF2, :BsmtUnfSF, :TotalBsmtSF, :GarageCars,
                :GarageArea, :BsmtFullBath, :BsmtHalfBath)
        X_test_full[!, col] = something.(tryparse.(Int, X_test_full[!, col]), missing)
    end
end
1459-element Vector{Union{Missing, Int64}}:
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 1
 0
X_test_full # preview the raw test table
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShape...
1146120"RH"8011622"Pave""NA""Reg"
2146220"RL"8114267"Pave""NA""IR1"
3146360"RL"7413830"Pave""NA""IR1"
4146460"RL"789978"Pave""NA""IR1"
51465120"RL"435005"Pave""NA""IR1"
6146660"RL"7510000"Pave""NA""IR1"
7146720"RL"missing7980"Pave""NA""IR1"
8146860"RL"638402"Pave""NA""IR1"
9146920"RL"8510176"Pave""NA""Reg"
10147020"RL"708400"Pave""NA""Reg"
...
1459291960"RL"749627"Pave""NA""Reg"
size(X_test_full) # (rows, all columns) of the raw test table
(1459, 80)
# Same numeric-column filter as for the training features.
# `!` indexing reads the column without copying it, unlike `[:, x]`.
X_test = select(X_test_full, Cols(c -> eltype(X_test_full[!, c]) <: Union{Number, Missing}));
size(X_test) # (rows, numeric feature columns) — matches X's column count
(1459, 37)
# 80/20 train/validation split of features and target together (multi=true);
# rng=0 fixes the seed so the split is reproducible.
(X_train, X_valid), (y_train, y_valid) = partition((X, y), 0.8, multi=true, rng=0);

Preliminary Investigation

(1168, 37)
# Count missing values per column of the training split.
describe(X_train, :nmissing)
variablenmissing
1:Id0
2:MSSubClass0
3:LotFrontage212
4:LotArea0
5:OverallQual0
6:OverallCond0
7:YearBuilt0
8:YearRemodAdd0
9:MasVnrArea6
10:BsmtFinSF10
...
37:YrSold0
# Fit a 100-tree random forest on (X_train, y_train) and return the mean
# absolute error of its predictions on (X_valid, y_valid). Used to compare
# the different missing-value strategies on equal footing.
function score_dataset(X_train, X_valid, y_train, y_valid)
    RandomForest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
    rf_machine = machine(RandomForest(n_trees=100, rng=0), X_train, y_train,
                         scitype_check_level=0)
    fit!(rf_machine, verbosity=0)
    return mean_absolute_error(predict(rf_machine, X_valid), y_valid)
end
score_dataset (generic function with 1 method)
# Names of training columns that still contain `missing` entries.
cols_with_missing = names(X_train, [any(ismissing, col) for col in eachcol(X_train)])
3-element Vector{String}:
 "LotFrontage"
 "MasVnrArea"
 "GarageYrBlt"
# Baseline approach 1: drop every column that has any missing data.
reduced_X_train = X_train[:, Not(cols_with_missing)];
reduced_X_valid = X_valid[:, Not(cols_with_missing)];
begin
    # Report the MAE of the drop-columns baseline.
    println("MAE (Drop columns with missing values):")
    drop_mae = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
    println(drop_mae)
end
# Imputer with MLJ defaults: median for continuous, rounded median for counts,
# mode for finite columns (see the printed model parameters below).
my_imputer = FillImputer()
FillImputer(
  features = Symbol[], 
  continuous_fill = MLJModels._median, 
  count_fill = MLJModels._round_median, 
  finite_fill = MLJModels._mode)
# Bind the imputer to the training features and fit it (learns fill values).
mach = fit!(machine(my_imputer, X_train))
trained Machine; caches model-specific representations of data
  model: FillImputer(features = Symbol[], …)
  args: 
    1:	Source @464 ⏎ ScientificTypesBase.Table{Union{AbstractVector{Union{Missing, ScientificTypesBase.Count}}, AbstractVector{ScientificTypesBase.Count}}}
# Baseline approach 2: fill the gaps in both splits using the values the
# imputer learned from the training split, then score the imputed data.
imputed_X_train = MLJ.transform(mach, X_train);
imputed_X_valid = MLJ.transform(mach, X_valid);
begin
    println("MAE (Imputation):")
    imputed_mae = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
    println(imputed_mae)
end

Part A

begin
    # Final pipeline: fit a fresh imputer on the training split and apply it
    # to both the training and validation features.
    final_imputer = FillImputer()
    impute_mach = fit!(machine(final_imputer, X_train))
    final_X_train = MLJ.transform(impute_mach, X_train)
    final_X_valid = MLJ.transform(impute_mach, X_valid)
end;
begin
    # Train the final 100-tree forest on the imputed training data and
    # report the validation MAE.
    Forest = @load RandomForestRegressor pkg=DecisionTree verbosity=0
    forest = Forest(n_trees=100, rng=0)
    final_mach = fit!(machine(forest, final_X_train, y_train, scitype_check_level=0))
    preds_valid = predict(final_mach, final_X_valid)
    println("MAE (Your approach):")
    println(mean_absolute_error(preds_valid, y_valid))
end

Part B

# Apply the imputer fitted on the training split to the test features.
final_X_test = MLJ.transform(impute_mach, X_test)
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAdd...
114612080116225619611961
214622081142676619581958
314636074138305519971998
41464607899786619981998
514651204350058519921992
614666075100006519931994
71467207079806719922007
81468606384026519981998
914692085101767519901990
101470207084004519701970
...
14592919607496277519931994
# Test predictions from the final trained forest on the imputed test features.
preds_test = predict(final_mach, final_X_test)
1459-element Vector{Float64}:
 129529.76
 150821.75
 181431.32
 183204.75
 186289.32
 189813.35
 175749.72
      ⋮
  82270.52
  81732.5
  87298.61
 167994.5
 119096.07
 242675.13
begin
    # Assemble the Kaggle submission table (Id, SalePrice) and write it out.
    # joinpath builds an OS-portable path instead of hard-coding "//" separators.
    output = DataFrame("Id" => X_test.Id,
                       "SalePrice" => preds_test)
    CSV.write(joinpath("data", "home-data-for-ml-course", "submissions_02x02.csv"), output)
end
"data//home-data-for-ml-course//submissions_02x02.csv"

Built with Julia 1.9.1 and

CSV 0.10.11
DataFrames 1.5.0
MLJ 0.19.2
MLJDecisionTreeInterface 0.4.0

To run this tutorial locally, download [this file](/tutorials/missingvalues02x02.jl) and open it with Pluto.jl.