Bluebook Bulldozer

Shows the usage of the aiking library on a Kaggle dataset.

Import public packages

```python
import os
import sys
import pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

import fastcore
from fastcore.all import *
from fastcore.imports import *

import sklearn
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
```

Import private packages
```python
is_kaggle = 'kaggle_secrets' in sys.modules
if is_kaggle:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
    from aiking.data.external import *
    path = untar_data("kaggle_competitions::bluebook-bulldozer-remix")
    print(path.ls())
```
[Path('/AIKING_HOME/data/bluebook-bulldozer-remix/bluebook-bulldozer-remix.zip'), Path('/AIKING_HOME/data/bluebook-bulldozer-remix/Data Dictionary.xlsx'), Path('/AIKING_HOME/data/bluebook-bulldozer-remix/Train'), Path('/AIKING_HOME/data/bluebook-bulldozer-remix/TrainingData'), Path('/AIKING_HOME/data/bluebook-bulldozer-remix/Valid')]
```python
from aiking.ml.structured import *
```
Read the Dataset
```python
data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input"))
path = data_dir/"bluebook-bulldozer-remix"
path.ls()
```
(#5) [Path('/kaggle/input/bluebook-bulldozer-remix/bluebook-bulldozer-remix.zip'),Path('/kaggle/input/bluebook-bulldozer-remix/Data Dictionary.xlsx'),Path('/kaggle/input/bluebook-bulldozer-remix/Train'),Path('/kaggle/input/bluebook-bulldozer-remix/TrainingData'),Path('/kaggle/input/bluebook-bulldozer-remix/Valid')]
```python
df_train = pd.read_csv(path/"Train/Train.csv", low_memory=False, parse_dates=['saledate'], infer_datetime_format=True); df_train.head()
df_test = pd.read_csv(path/"Valid/Valid.csv", parse_dates=['saledate'], infer_datetime_format=True); df_test.head()
```
SalesID | MachineID | ModelID | datasource | auctioneerID | YearMade | MachineHoursCurrentMeter | UsageBand | saledate | fiModelDesc | ... | Undercarriage_Pad_Width | Stick_Length | Thumb | Pattern_Changer | Grouser_Type | Backhoe_Mounting | Blade_Type | Travel_Controls | Differential_Type | Steering_Controls | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1222837 | 902859 | 1376 | 121 | 3 | 1000 | 0.0 | NaN | 2012-01-05 | 375L | ... | None or Unspecified | None or Unspecified | None or Unspecified | None or Unspecified | Double | NaN | NaN | NaN | NaN | NaN |
1 | 1222839 | 1048320 | 36526 | 121 | 3 | 2006 | 4412.0 | Medium | 2012-01-05 | TX300LC2 | ... | None or Unspecified | 12' 4" | None or Unspecified | Yes | Double | NaN | NaN | NaN | NaN | NaN |
2 | 1222841 | 999308 | 4587 | 121 | 3 | 2000 | 10127.0 | Medium | 2012-01-05 | 270LC | ... | None or Unspecified | 12' 4" | None or Unspecified | None or Unspecified | Double | NaN | NaN | NaN | NaN | NaN |
3 | 1222843 | 1062425 | 1954 | 121 | 3 | 1000 | 4682.0 | Low | 2012-01-05 | 892DLC | ... | None or Unspecified | None or Unspecified | None or Unspecified | None or Unspecified | Double | NaN | NaN | NaN | NaN | NaN |
4 | 1222845 | 1032841 | 4701 | 121 | 3 | 2002 | 8150.0 | Medium | 2012-01-04 | 544H | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Standard | Conventional |
5 rows × 52 columns
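Parsing saledate as a datetime up front matters because the feature pipeline later expands it into date parts (include_time_cols=True below). As a minimal sketch of that kind of expansion, independent of aiking and with illustrative column names:

```python
# Illustrative only: aiking's feature layer derives date features internally
# when include_time_cols=True; this shows the idea with plain pandas.
dates = df_train['saledate']
date_parts = pd.DataFrame({
    'saleYear': dates.dt.year,
    'saleMonth': dates.dt.month,
    'saleDayofweek': dates.dt.dayofweek,
    'saleElapsed': dates.astype('int64') // 10**9,  # seconds since epoch
})
date_parts.head()
```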
Modelling
Define category ordering
```python
max_n_cat = 0
cat_dict = get_cat_dict(df_train, max_n_cat=max_n_cat)
display_cat(cat_dict)
```
0 | 1 | |
---|---|---|
0 | UsageBand | [High, Low, Medium] |
1 | fiModelDesc | [100C, 104, 1066, 1066E, 1080, 1080B, 1088, 1088CK, 1088LT, 1088TTL, 10B, 10C, 10DG, 110, 1105, 110S, 110TLB, 110Z, 110Z-2, 112, 112E, 112F, 115, 1150, 1150B, 1150BLGP, 1150C, 1150D, 1150E, 1150ELT, 1150G, 1150GLGP, 1150GLT, 1150H, 1150HLGP, 1150HLT, 1150HWT, 1150K, 1150KLGPSERIES3, 1150KWT, 1150KXLT, 1150KXLTIII, 115SRDZ, 115Z, 115ZIII, 115ZIV, 115ZIV-2, 115ZV, 116, 1166, 118, 1187C, 1188, 1188LC, 1188P, 118B, 118C, 11B, 11C, 12, 120, 120B, 120C, 120CLC, 120D, 120E, 120G, 120H, 120HNA, 120LC, 120M , 125, 125A, 125B, 125C, 125CKBNA, 1280, 1280B, 1288, 12E, 12F, 12G, 12H, 12HNA, 12JX, 130, 1300, 1300D, 130G, 130LC, 130LX, 1340XL, 135, 135C, 135CRTS, 135H, 135HNA, 135MSR SPIN ACE, 135SR, 135SRLC, ...] |
2 | fiBaseModel | [10, 100, 104, 1066, 1080, 1088, 10DG, 11, 110, 1105, 112, 115, 1150, 116, 1166, 118, 1187, 1188, 12, 120, 125, 1280, 1288, 130, 1300, 1340, 135, 137, 14, 140, 1400, 143, 1450, 15, 150, 1500, 153, 155, 1550, 16, 160, 1600, 163, 165, 1650, 166, 17, 170, 1700, 1737, 1740, 175, 1750, 1760XL, 17ZTS, 18, 1800, 1818, 1825, 1830, 1835, 1838, 1840, 1845, 185, 1850, 190, 1900, 198, 20, 200, 2000, 2022, 2026, 2040, 2042, 2044, 205, 2050, 2054, 2060, 2060XL, 2064, 2066, 2070, 2074, 2076, 208, 2086, 2095, 2099, 21, 210, 2105, 2109, 211, 212, 213, 213LC, 214, ...] |
3 | fiSecondaryDesc | [ MSR SPIN ACE, #NAME?, -2, -3, -5, -5L, -6, -7, 0.7, 1, 2, 3, 5, 7, A, AA, AB, AG, AW, AX, B, B , BEC, BL, BLGP, BLGPPS, BZ, BZD, C, C , CE, CH, CK, CKB, CL, CLR, CM, CR, CS, CX, D, DC, DL, DT, DX, DXT, DZ, E, EG, EL, ESS, EST, EW, EX, F, FR, G, GT, H, H , H90, HAG, HD, HDS, HDSL, HF, HL, HLGP, HLS, HX, HZ, IV, J, JX, K, L, LC, LC7A, LC7LR, LCD, LCH, LCLR, LCM, LD, LE, LGP, LR, LS, LT, LX, M, M , MC, MR, MRX, MSR, MSR SPIN ACE, MT, MU, MXT, ...] |
4 | fiModelSeries | [ III, #NAME?, -1, -1.50E+01, -11, -12, -15, -16, -17, -18, -1B, -1C, -1L, -2, -20, -21, -21A, -2A, -2C, -2E, -2LC, -2N, -3, -3C, -3EO, -3H, -3L, -3LC, -3LK, -3M, -3MC, -3PT, -4, -5, -5A, -5E, -5F, -5H, -5L, -5LC, -6, -6A, -6B, -6C, -6E, -6K, -6LC, -6LK, -7, -7B, -7E, -7K, -8, -8E, 1, 12, 14FT, 15, 16, 17, 18, 1970, 2, 20, 21KomStat, 21KomStatII, 22, 2B, 2T, 3, 3A, 3C, 4, 5, 5N, 6, 6.00E+00, 6F, 6L, 6LE, 6LK, 7, 7.00E+00, 7A, 7L, 8, A, AWS, D, D7, E, EX, Elite, FASTRRACK, G, GALEO, H, II, III, IV, ...] |
5 | fiModelDescriptor | [ 14FT, LGP, SUPER, XLT, XT, ZX, (BLADE RUNNER), 1, 2, 2.00E+00, 2N, 3, 3.00E+00, 3C, 3L, 3NO, 4WD, 4x4x4, 5, 6, 6K, 7, 7.00E+00, 7A, 8, A, AE0, AVANCE, B, BE, C, CK, CR, CRSB, CUSTOM, DA, DELUXE, DHP, DINGO, DLL, DT, DW, E, ESL, G, GALEO, H, H5, HD, HF, HSD, HT, High Lift, HighLift, II, III, IT, IV, K, K3, K5, KA, KBNA, L, LC, LC8, LCH, LCR, LCRTS, LE, LGP, LGPVP, LITRONIC, LK, LL, LM, LN, LR, LRC, LRR, LS, LT, LU, LX, LongReach, M, MC, ME, MH, N, NLC, NSUC, P, PLUS, PRO, RR, RTS, S, SA, SB, ...] |
6 | ProductSize | [Compact, Large, Large / Medium, Medium, Mini, Small] |
6 | fiProductClassDesc | [Backhoe Loader - 0.0 to 14.0 Ft Standard Digging Depth, Backhoe Loader - 14.0 to 15.0 Ft Standard Digging Depth, Backhoe Loader - 15.0 to 16.0 Ft Standard Digging Depth, Backhoe Loader - 16.0 + Ft Standard Digging Depth, Backhoe Loader - Unidentified, Hydraulic Excavator, Track - 0.0 to 2.0 Metric Tons, Hydraulic Excavator, Track - 11.0 to 12.0 Metric Tons, Hydraulic Excavator, Track - 12.0 to 14.0 Metric Tons, Hydraulic Excavator, Track - 14.0 to 16.0 Metric Tons, Hydraulic Excavator, Track - 150.0 to 300.0 Metric Tons, Hydraulic Excavator, Track - 16.0 to 19.0 Metric Tons, Hydraulic Excavator, Track - 19.0 to 21.0 Metric Tons, Hydraulic Excavator, Track - 2.0 to 3.0 Metric Tons, Hydraulic Excavator, Track - 21.0 to 24.0 Metric Tons, Hydraulic Excavator, Track - 24.0 to 28.0 Metric Tons, Hydraulic Excavator, Track - 28.0 to 33.0 Metric Tons, Hydraulic Excavator, Track - 3.0 to 4.0 Metric Tons, Hydraulic Excavator, Track - 300.0 + Metric Tons, Hydraulic Excavator, Track - 33.0 to 40.0 Metric Tons, Hydraulic Excavator, Track - 4.0 to 5.0 Metric Tons, Hydraulic Excavator, Track - 4.0 to 6.0 Metric Tons, Hydraulic Excavator, Track - 40.0 to 50.0 Metric Tons, Hydraulic Excavator, Track - 5.0 to 6.0 Metric Tons, Hydraulic Excavator, Track - 50.0 to 66.0 Metric Tons, Hydraulic Excavator, Track - 6.0 to 8.0 Metric Tons, Hydraulic Excavator, Track - 66.0 to 90.0 Metric Tons, Hydraulic Excavator, Track - 8.0 to 11.0 Metric Tons, Hydraulic Excavator, Track - 90.0 to 150.0 Metric Tons, Hydraulic Excavator, Track - Unidentified, Hydraulic Excavator, Track - Unidentified (Compact Construction), Motorgrader - 130.0 to 145.0 Horsepower, Motorgrader - 145.0 to 170.0 Horsepower, Motorgrader - 170.0 to 200.0 Horsepower, Motorgrader - 200.0 + Horsepower, Motorgrader - 45.0 to 130.0 Horsepower, Motorgrader - Unidentified, Skid Steer Loader - 0.0 to 701.0 Lb Operating Capacity, Skid Steer Loader - 1251.0 to 1351.0 Lb Operating Capacity, Skid Steer Loader - 1351.0 to 1601.0 Lb Operating Capacity, Skid Steer Loader - 1601.0 to 1751.0 Lb Operating Capacity, Skid Steer Loader - 1751.0 to 2201.0 Lb Operating Capacity, Skid Steer Loader - 2201.0 to 2701.0 Lb Operating Capacity, Skid Steer Loader - 2701.0+ Lb Operating Capacity, Skid Steer Loader - 701.0 to 976.0 Lb Operating Capacity, Skid Steer Loader - 976.0 to 1251.0 Lb Operating Capacity, Skid Steer Loader - Unidentified, Track Type Tractor, Dozer - 105.0 to 130.0 Horsepower, Track Type Tractor, Dozer - 130.0 to 160.0 Horsepower, Track Type Tractor, Dozer - 160.0 to 190.0 Horsepower, Track Type Tractor, Dozer - 190.0 to 260.0 Horsepower, Track Type Tractor, Dozer - 20.0 to 75.0 Horsepower, Track Type Tractor, Dozer - 260.0 + Horsepower, Track Type Tractor, Dozer - 75.0 to 85.0 Horsepower, Track Type Tractor, Dozer - 85.0 to 105.0 Horsepower, Track Type Tractor, Dozer - Unidentified, Wheel Loader - 0.0 to 40.0 Horsepower, Wheel Loader - 100.0 to 110.0 Horsepower, Wheel Loader - 1000.0 + Horsepower, Wheel Loader - 110.0 to 120.0 Horsepower, Wheel Loader - 120.0 to 135.0 Horsepower, Wheel Loader - 135.0 to 150.0 Horsepower, Wheel Loader - 150.0 to 175.0 Horsepower, Wheel Loader - 175.0 to 200.0 Horsepower, Wheel Loader - 200.0 to 225.0 Horsepower, Wheel Loader - 225.0 to 250.0 Horsepower, Wheel Loader - 250.0 to 275.0 Horsepower, Wheel Loader - 275.0 to 350.0 Horsepower, Wheel Loader - 350.0 to 500.0 Horsepower, Wheel Loader - 40.0 to 60.0 Horsepower, Wheel Loader - 500.0 to 1000.0 Horsepower, Wheel Loader - 60.0 to 80.0 Horsepower, Wheel Loader - 80.0 to 90.0 Horsepower, Wheel Loader - 90.0 to 100.0 Horsepower, Wheel Loader - Unidentified] |
8 | state | [Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecticut, Delaware, Florida, Georgia, Hawaii, Idaho, Illinois, Indiana, Iowa, Kansas, Kentucky, Louisiana, Maine, Maryland, Massachusetts, Michigan, Minnesota, Mississippi, Missouri, Montana, Nebraska, Nevada, New Hampshire, New Jersey, New Mexico, New York, North Carolina, North Dakota, Ohio, Oklahoma, Oregon, Pennsylvania, Puerto Rico, Rhode Island, South Carolina, South Dakota, Tennessee, Texas, Unspecified, Utah, Vermont, Virginia, Washington, Washington DC, West Virginia, Wisconsin, Wyoming] |
9 | ProductGroup | [BL, MG, SSL, TEX, TTT, WL] |
10 | ProductGroupDesc | [Backhoe Loaders, Motor Graders, Skid Steer Loaders, Track Excavators, Track Type Tractors, Wheel Loader] |
11 | Drive_System | [All Wheel Drive, Four Wheel Drive, No, Two Wheel Drive] |
12 | Enclosure | [EROPS, EROPS AC, EROPS w AC, NO ROPS, None or Unspecified, OROPS] |
13 | Forks | [None or Unspecified, Yes] |
14 | Pad_Type | [Grouser, None or Unspecified, Reversible, Street] |
15 | Ride_Control | [No, None or Unspecified, Yes] |
16 | Stick | [Extended, Standard] |
17 | Transmission | [AutoShift, Autoshift, Direct Drive, Hydrostatic, None or Unspecified, Powershift, Powershuttle, Standard] |
18 | Turbocharged | [None or Unspecified, Yes] |
19 | Blade_Extension | [None or Unspecified, Yes] |
20 | Blade_Width | [12', 13', 14', 16', <12', None or Unspecified] |
21 | Enclosure_Type | [High Profile, Low Profile, None or Unspecified] |
22 | Engine_Horsepower | [No, Variable] |
23 | Hydraulics | [2 Valve, 3 Valve, 4 Valve, Auxiliary, Base + 1 Function, Base + 2 Function, Base + 3 Function, Base + 4 Function, Base + 5 Function, Base + 6 Function, None or Unspecified, Standard] |
24 | Pushblock | [None or Unspecified, Yes] |
25 | Ripper | [Multi Shank, None or Unspecified, Single Shank, Yes] |
26 | Scarifier | [None or Unspecified, Yes] |
27 | Tip_Control | [None or Unspecified, Sideshift & Tip, Tip] |
28 | Tire_Size | [10 inch, 10", 13", 14", 15.5, 15.5", 17.5, 17.5", 20.5, 20.5", 23.1", 23.5, 23.5", 26.5, 29.5, 7.0", None or Unspecified] |
29 | Coupler | [Hydraulic, Manual, None or Unspecified] |
30 | Coupler_System | [None or Unspecified, Yes] |
31 | Grouser_Tracks | [None or Unspecified, Yes] |
32 | Hydraulics_Flow | [High Flow, None or Unspecified, Standard] |
33 | Track_Type | [Rubber, Steel] |
34 | Undercarriage_Pad_Width | [14 inch, 15 inch, 16 inch, 18 inch, 20 inch, 22 inch, 24 inch, 25 inch, 26 inch, 27 inch, 28 inch, 30 inch, 31 inch, 31.5 inch, 32 inch, 33 inch, 34 inch, 36 inch, None or Unspecified] |
35 | Stick_Length | [10' 10", 10' 2", 10' 6", 11' 0", 11' 10", 12' 10", 12' 4", 12' 8", 13' 10", 13' 7", 13' 9", 14' 1", 15' 4", 15' 9", 19' 8", 24' 3", 6' 3", 7' 10", 8' 10", 8' 2", 8' 4", 8' 6", 9' 10", 9' 2", 9' 5", 9' 6", 9' 7", 9' 8", None or Unspecified] |
36 | Thumb | [Hydraulic, Manual, None or Unspecified] |
37 | Pattern_Changer | [No, None or Unspecified, Yes] |
38 | Grouser_Type | [Double, Single, Triple] |
39 | Backhoe_Mounting | [None or Unspecified, Yes] |
40 | Blade_Type | [Angle, Coal, Landfill, No, None or Unspecified, PAT, Semi U, Straight, U, VPAT] |
41 | Travel_Controls | [1 Speed, 2 Pedal, Differential Steer, Finger Tip, Lever, None or Unspecified, Pedal] |
42 | Differential_Type | [Limited Slip, Locking, No Spin, Standard] |
43 | Steering_Controls | [Command Control, Conventional, Four Wheel Standard, No, Wheel] |
```python
updates = {
    'UsageBand': ['High', 'Medium', 'Low'],
    'Blade_Width': ["16'", "14'", "13'", "12'", "<12'", "None or Unspecified"],
    'Grouser_Type': ['Triple', 'Double', 'Single'],
    'ProductSize': ['Large', 'Large / Medium', 'Medium', 'Compact', 'Small', 'Mini']
}
cat_dict.update(updates)
filter_dict = {k: v for k, v in cat_dict.items() if k in updates}
display_cat(filter_dict)
```
0 | 1 | |
---|---|---|
0 | UsageBand | [High, Medium, Low] |
1 | ProductSize | [Large, Large / Medium, Medium, Compact, Small, Mini] |
2 | Blade_Width | [16', 14', 13', 12', <12', None or Unspecified] |
3 | Grouser_Type | [Triple, Double, Single] |
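These orderings take effect once the columns are encoded as ordered categoricals, so that integer codes respect the stated order. A minimal sketch of that application in plain pandas (aiking's encoder presumably does the equivalent internally):

```python
# Illustrative: apply one hand-specified ordering as an ordered categorical,
# so the integer codes follow it (High=0, Medium=1, Low=2).
usage = pd.Categorical(df_train['UsageBand'],
                       categories=updates['UsageBand'],
                       ordered=True)
print(usage.codes[:5])  # missing/unseen values map to -1
```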
Define Validation Set
```python
def range_data(df_train, df_test, names=['Train', 'Test'], datecol='saledate'):
    return pd.DataFrame([
        {'Name': names[0], 'Start': df_train[datecol].min(), 'End': df_train[datecol].max(), 'Interval': df_train[datecol].max() - df_train[datecol].min(), 'Size': len(df_train)},
        {'Name': names[1], 'Start': df_test[datecol].min(), 'End': df_test[datecol].max(), 'Interval': df_test[datecol].max() - df_test[datecol].min(), 'Size': len(df_test)}
    ]).set_index('Name')

range_data(df_train, df_test)
```
Start | End | Interval | Size | |
---|---|---|---|---|
Name | ||||
Train | 1989-01-17 | 2011-12-30 | 8382 days | 401125 |
Test | 2012-01-01 | 2012-04-28 | 118 days | 11573 |
The test data starts right where the training data ends and spans about four months (118 days, covering February's 28/29 days). We need to split off an internal validation set the same way, so we take the last 4 months / 120 days of training data as validation.
```python
validation_date_start = (df_train[['saledate']].max() - pd.Timedelta(value=120, unit='D')).values[0]  # last 4 months for internal validation
df_model, df_valid = df_train[df_train['saledate'] < validation_date_start], df_train[df_train['saledate'] >= validation_date_start]
range_data(df_model, df_valid, names=['Model', 'Valid'])
```
Start | End | Interval | Size | |
---|---|---|---|---|
Name | ||||
Model | 1989-01-17 | 2011-08-31 | 8261 days | 390213 |
Valid | 2011-09-01 | 2011-12-30 | 120 days | 10912 |
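As a quick guard against temporal leakage, we can also assert that the two ranges do not overlap (a trivial check, shown for completeness):

```python
# The newest training sale must predate the oldest validation sale.
assert df_model['saledate'].max() < df_valid['saledate'].min()
```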
Define Pipeline
```python
def get_model_pipeline(max_n_cat=0,
                       cat_dict=None,
                       scale_dict={'class': StandardScaler},
                       cat_num_dict={'class': NumericalEncoder, 'categories': None},
                       cat_dummy_dict={'class': OneHotEncoder, 'handle_unknown': 'ignore'},
                       imputer_dict={'class': SimpleImputer, 'strategy': 'median'},
                      ):
    layer_spec_default = (get_default_feature_def,
                          {
                              'skip_flds': None,
                              'ignored_flds': None,
                              'max_n_cat': max_n_cat,
                              'na_exclude_cols': [],
                              'scale_var_num': True,
                              'scale_var_cat': False,
                              'scale_dict': scale_dict,
                              'cat_num_dict': cat_num_dict,
                              'cat_dummy_dict': cat_dummy_dict,
                              'imputer_dict': imputer_dict,
                              'include_time_cols': True,
                              'keep_dt_cols': False,
                              'cat_dict': cat_dict
                          })
    layer_specs = [layer_spec_default]
    proc = Proc(layer_specs=layer_specs)  # proc.fit_transform(X)
    model = RandomForestRegressor(n_jobs=-1)
    pipeline = make_pipeline(proc, model)
    return pipeline

pipeline = get_model_pipeline(cat_dict=cat_dict); pipeline
```
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object>),
('randomforestregressor', RandomForestRegressor(n_jobs=-1))])
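Proc is aiking's preprocessing transformer. As a rough sklearn-only analogue of what this layer spec sets up (simplified: no date expansion and no ordered-category handling), a ColumnTransformer would play the same role:

```python
from sklearn.compose import ColumnTransformer

# Simplified stand-in for the Proc layer: impute and scale numeric columns,
# impute and one-hot encode categorical ones.
num_cols = df_train.select_dtypes('number').columns.drop('SalePrice')
cat_cols = df_train.select_dtypes('object').columns
preproc = ColumnTransformer([
    ('num', make_pipeline(SimpleImputer(strategy='median'), StandardScaler()), num_cols),
    ('cat', make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'),
                          OneHotEncoder(handle_unknown='ignore')), cat_cols),
])
sketch_pipeline = make_pipeline(preproc, RandomForestRegressor(n_jobs=-1))
```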
Train on Partial Data
```python
max_n_cat = 0

def get_xy(df, col='SalePrice'): return df.drop([col], axis=1), np.log(df[col])

df = df_model.sample(frac=0.04)
X, y = get_xy(df)
X_model, y_model = get_xy(df_model)
X_valid, y_valid = get_xy(df_valid)

pipeline = get_model_pipeline(max_n_cat, cat_dict)
pipeline.fit(X, y)
```
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object>),
('randomforestregressor', RandomForestRegressor(n_jobs=-1))])
```python
get_score(pipeline, X, y, X_valid, y_valid, scorers=get_scorer_dict())
```
Training | Validation | |
---|---|---|
Metric | ||
r2 | 0.978088 | 0.821789 |
neg_root_mean_squared_error | -0.102343 | -0.305869 |
explained_variance | 0.978089 | 0.825985 |
neg_median_absolute_error | -0.055909 | -0.181245 |
neg_mean_absolute_percentage_error | -0.007455 | -0.022704 |
```python
get_score(pipeline, X_model, y_model, X_valid, y_valid, scorers=get_scorer_dict())
```
Training | Validation | |
---|---|---|
Metric | ||
r2 | 0.853482 | 0.821789 |
neg_root_mean_squared_error | -0.265110 | -0.305869 |
explained_variance | 0.853483 | 0.825985 |
neg_median_absolute_error | -0.144865 | -0.181245 |
neg_mean_absolute_percentage_error | -0.019361 | -0.022704 |
Since the model predicts log(SalePrice), RMSE on the log target is exactly the competition's RMSLE. This puts the estimated RMSLE at roughly 0.26 to 0.31 (about 0.306 from the validation set).
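That equivalence can be checked directly; a minimal sketch, assuming the pipeline and validation split defined above:

```python
from sklearn.metrics import mean_squared_error

# RMSE on the log target equals RMSLE on the raw prices,
# which is this competition's evaluation metric.
preds = pipeline.predict(X_valid)                    # predictions in log space
rmsle = np.sqrt(mean_squared_error(y_valid, preds))  # y_valid is already log(SalePrice)
print(f"Validation RMSLE: {rmsle:.3f}")
```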
Cross validation estimate
```python
df_cv = timeseries_cv(df_model, 'saledate', 'SalePrice',
                      pipeline_callback_dict={'func': get_model_pipeline, 'func_kwargs': {'max_n_cat': 0, 'cat_dict': cat_dict}},
                      y_mod_func=np.log,
                      scorers=get_scorer_dict(),
                      n_train=15000, n_test=12000, n_splits=10)
df_cv
```
Training | Validation | set | train_start | train_end | valid_start | valid_end | |
---|---|---|---|---|---|---|---|
Metric | |||||||
r2 | 0.986050 | 0.855443 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
neg_root_mean_squared_error | -0.082520 | -0.262933 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
explained_variance | 0.986051 | 0.863314 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
neg_median_absolute_error | -0.037583 | -0.148024 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
neg_mean_absolute_percentage_error | -0.005600 | -0.019542 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
r2 | 0.984919 | 0.839832 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
neg_root_mean_squared_error | -0.085098 | -0.280481 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
explained_variance | 0.984919 | 0.865230 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
neg_median_absolute_error | -0.040576 | -0.167759 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
neg_mean_absolute_percentage_error | -0.005859 | -0.021296 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
r2 | 0.985811 | 0.855938 | 3 | 2008-02-26 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
neg_root_mean_squared_error | -0.083452 | -0.268049 | 3 | 2008-02-26 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
explained_variance | 0.985812 | 0.870222 | 3 | 2008-02-26 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
neg_median_absolute_error | -0.039221 | -0.161301 | 3 | 2008-02-26 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
neg_mean_absolute_percentage_error | -0.005723 | -0.020529 | 3 | 2008-02-26 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
r2 | 0.986149 | 0.859721 | 4 | 2008-02-29 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
neg_root_mean_squared_error | -0.082841 | -0.263684 | 4 | 2008-02-29 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
explained_variance | 0.986149 | 0.860370 | 4 | 2008-02-29 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
neg_median_absolute_error | -0.039276 | -0.152460 | 4 | 2008-02-29 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
neg_mean_absolute_percentage_error | -0.005729 | -0.019758 | 4 | 2008-02-29 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
r2 | 0.985700 | 0.864896 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
neg_root_mean_squared_error | -0.084180 | -0.254076 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
explained_variance | 0.985700 | 0.864995 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
neg_median_absolute_error | -0.040860 | -0.144788 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
neg_mean_absolute_percentage_error | -0.005935 | -0.018973 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
r2 | 0.984630 | 0.847664 | 6 | 2008-05-01 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
neg_root_mean_squared_error | -0.086090 | -0.272201 | 6 | 2008-05-01 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
explained_variance | 0.984633 | 0.855422 | 6 | 2008-05-01 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
neg_median_absolute_error | -0.040860 | -0.155152 | 6 | 2008-05-01 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
neg_mean_absolute_percentage_error | -0.005967 | -0.019876 | 6 | 2008-05-01 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
r2 | 0.985190 | 0.843297 | 7 | 2008-05-15 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
neg_root_mean_squared_error | -0.084891 | -0.278687 | 7 | 2008-05-15 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
explained_variance | 0.985192 | 0.844513 | 7 | 2008-05-15 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
neg_median_absolute_error | -0.039577 | -0.151982 | 7 | 2008-05-15 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
neg_mean_absolute_percentage_error | -0.005821 | -0.020361 | 7 | 2008-05-15 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
r2 | 0.984002 | 0.871116 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
neg_root_mean_squared_error | -0.088833 | -0.253561 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
explained_variance | 0.984003 | 0.871649 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
neg_median_absolute_error | -0.043196 | -0.143973 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
neg_mean_absolute_percentage_error | -0.006185 | -0.018764 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
r2 | 0.985709 | 0.859266 | 9 | 2009-02-16 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
neg_root_mean_squared_error | -0.084228 | -0.265014 | 9 | 2009-02-16 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
explained_variance | 0.985709 | 0.867493 | 9 | 2009-02-16 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
neg_median_absolute_error | -0.041602 | -0.146238 | 9 | 2009-02-16 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
neg_mean_absolute_percentage_error | -0.005938 | -0.019083 | 9 | 2009-02-16 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
r2 | 0.986866 | 0.860514 | 10 | 2009-05-06 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
neg_root_mean_squared_error | -0.081017 | -0.263815 | 10 | 2009-05-06 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
explained_variance | 0.986871 | 0.860803 | 10 | 2009-05-06 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
neg_median_absolute_error | -0.037374 | -0.151020 | 10 | 2009-05-06 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
neg_mean_absolute_percentage_error | -0.005516 | -0.019471 | 10 | 2009-05-06 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
```python
df_cv.loc['neg_root_mean_squared_error'][['train_end', 'Training', 'Validation']].set_index('train_end').plot()
```
<AxesSubplot:xlabel='train_end'>
```python
df_cv.loc['neg_root_mean_squared_error'][['Validation']].mean()
```
Validation -0.26625
dtype: float64
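timeseries_cv is an aiking helper; for readers without the library, a rough sklearn-only analogue of the same rolling-window evaluation (window sizes as above, scoring only RMSLE) could look like:

```python
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Rough stand-in for timeseries_cv: rolling splits ordered by sale date.
df_sorted = df_model.sort_values('saledate').reset_index(drop=True)
tscv = TimeSeriesSplit(n_splits=10, max_train_size=15000, test_size=12000)
for fold, (tr_idx, va_idx) in enumerate(tscv.split(df_sorted), start=1):
    X_tr, y_tr = get_xy(df_sorted.iloc[tr_idx])
    X_va, y_va = get_xy(df_sorted.iloc[va_idx])
    pipe = get_model_pipeline(max_n_cat, cat_dict)
    pipe.fit(X_tr, y_tr)
    rmsle = np.sqrt(mean_squared_error(y_va, pipe.predict(X_va)))
    print(f"fold {fold}: validation RMSLE {rmsle:.3f}")
```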
Tuning
Number of training points to select for the best validation score. This requires a bit of experimentation with how many of the most recent points to train on.
```python
n = 8000
df_model_train = df_model.sort_values(by='saledate').iloc[-n:]
df_model_train = df_model_train.sample(frac=1)  # shuffle rows
X_model_train, y_model_train = get_xy(df_model_train)
pipeline = get_model_pipeline(max_n_cat, cat_dict)
pipeline.fit(X_model_train, y_model_train)
score_df = get_score(pipeline, X_model_train, y_model_train, X_valid, y_valid, scorers=get_scorer_dict())
score_df['n'] = n
score_df
```
Training | Validation | n | |
---|---|---|---|
Metric | |||
r2 | 0.986471 | 0.848635 | 8000 |
neg_root_mean_squared_error | -0.081511 | -0.281891 | 8000 |
explained_variance | 0.986471 | 0.849255 | 8000 |
neg_median_absolute_error | -0.038784 | -0.157271 | 8000 |
neg_mean_absolute_percentage_error | -0.005624 | -0.020596 | 8000 |
```python
np.linspace(20000, 5000, 16)
```
array([20000., 19000., 18000., 17000., 16000., 15000., 14000., 13000.,
12000., 11000., 10000., 9000., 8000., 7000., 6000., 5000.])
```python
def get_score_for_n(n=8000):
    df_model_train = df_model.sort_values(by='saledate').iloc[-n:]
    df_model_train = df_model_train.sample(frac=1)  # shuffle rows
    X_model_train, y_model_train = get_xy(df_model_train)
    pipeline = get_model_pipeline(max_n_cat, cat_dict)
    pipeline.fit(X_model_train, y_model_train)
    score_df = get_score(pipeline, X_model_train, y_model_train, X_valid, y_valid, scorers=get_scorer_dict())
    score_df['n'] = n
    return score_df

scores = pd.concat([get_score_for_n(n=int(n)) for n in np.linspace(20000, 5000, 16)]); scores
```
Training | Validation | n | |
---|---|---|---|
Metric | |||
r2 | 0.987612 | 0.869907 | 20000 |
neg_root_mean_squared_error | -0.078250 | -0.261333 | 20000 |
explained_variance | 0.987613 | 0.870216 | 20000 |
neg_median_absolute_error | -0.037010 | -0.147194 | 20000 |
neg_mean_absolute_percentage_error | -0.005343 | -0.019104 | 20000 |
... | ... | ... | ... |
r2 | 0.985790 | 0.833147 | 5000 |
neg_root_mean_squared_error | -0.083945 | -0.295961 | 5000 |
explained_variance | 0.985790 | 0.834022 | 5000 |
neg_median_absolute_error | -0.038053 | -0.166622 | 5000 |
neg_mean_absolute_percentage_error | -0.005691 | -0.021768 | 5000 |
80 rows × 3 columns
```python
scores.loc['neg_root_mean_squared_error']
```
Training | Validation | n | |
---|---|---|---|
Metric | |||
neg_root_mean_squared_error | -0.078250 | -0.261333 | 20000 |
neg_root_mean_squared_error | -0.078630 | -0.260586 | 19000 |
neg_root_mean_squared_error | -0.079291 | -0.261858 | 18000 |
neg_root_mean_squared_error | -0.080207 | -0.262133 | 17000 |
neg_root_mean_squared_error | -0.080376 | -0.265427 | 16000 |
neg_root_mean_squared_error | -0.081357 | -0.265461 | 15000 |
neg_root_mean_squared_error | -0.081260 | -0.267446 | 14000 |
neg_root_mean_squared_error | -0.080965 | -0.270179 | 13000 |
neg_root_mean_squared_error | -0.080303 | -0.270147 | 12000 |
neg_root_mean_squared_error | -0.081419 | -0.273167 | 11000 |
neg_root_mean_squared_error | -0.081806 | -0.276547 | 10000 |
neg_root_mean_squared_error | -0.081851 | -0.279289 | 9000 |
neg_root_mean_squared_error | -0.082139 | -0.281973 | 8000 |
neg_root_mean_squared_error | -0.081183 | -0.284795 | 7000 |
neg_root_mean_squared_error | -0.081215 | -0.285585 | 6000 |
neg_root_mean_squared_error | -0.083945 | -0.295961 | 5000 |
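The best n can also be read off programmatically; a small sketch (the scores are negated, so the maximum is best):

```python
# n with the best (least negative) validation RMSE; 19000 for the run above.
best_n = scores.loc['neg_root_mean_squared_error'].set_index('n')['Validation'].idxmax()
print(best_n)
```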
The conclusion is that we can try two models:

- train on the last 19000 points of df_model together with df_valid
- concatenate df_model with df_valid and take the last 19000 points
Predictions
```python
df_model_train = df_model.sort_values(by='saledate').iloc[-19000:]
df_sel_train = pd.concat([df_model_train, df_valid])
df_sel_train = df_sel_train.sample(frac=1)  # shuffle rows
X_sel_train, y_sel_train = get_xy(df_sel_train)

pipeline = get_model_pipeline(max_n_cat, cat_dict)
pipeline.fit(X_sel_train, y_sel_train)
```
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object>),
('randomforestregressor', RandomForestRegressor(n_jobs=-1))])
```python
plt.hist(np.exp(y_sel_train))
```
(array([10186., 8235., 4265., 2565., 1941., 1222., 922., 350.,
166., 60.]),
array([ 4750., 18475., 32200., 45925., 59650., 73375., 87100.,
100825., 114550., 128275., 142000.]),
<BarContainer object of 10 artists>)
```python
plt.hist(np.exp(pipeline.predict(df_test)))
```
(array([3914., 2885., 1674., 1086., 809., 575., 409., 141., 48.,
32.]),
array([ 5632.98698052, 17963.68207264, 30294.37716476, 42625.07225689,
54955.76734901, 67286.46244114, 79617.15753326, 91947.85262538,
104278.54771751, 116609.24280963, 128939.93790175]),
<BarContainer object of 10 artists>)
```python
df_submission = pd.DataFrame()
df_submission['Id'] = df_test['SalesID']
df_submission['Predicted'] = np.exp(pipeline.predict(df_test))
df_submission.to_csv('submission.csv', index=False)
```
```python
if not is_kaggle:
    import kaggle
    from aiking.integrations.kaggle import push2kaggle
    # kaggle.api.competition_submit_cli("submission.csv", "Submission from local machine", competition="bluebook-bulldozer-remix")
    push2kaggle("00_index.ipynb")
```