4.4 Robust Regression Models

4.4.1 Local Regression (loess)

In[]:=
session=StartExternalSession["Python"]
Out[]=
ExternalSessionObject[System: Python, EvaluationCount: None, UUID: a46f29bc-3368-4c88-866c-95d0551372e4]

Test
〉
In[]:=
5+6
Out[]=
11
In[]:=
wind=Import["E:\\wind.dat"]//Flatten;
In[]:=
ozone=Import["E:\\ozone.dat"]//Flatten;
In[]:=
data=Transpose[{wind,ozone}];
In[]:=
pdata=ListPlot[data,AspectRatio->1,PlotRange->All,AxesOrigin->{0,0},PlotStyle->PointSize[0.02],Frame->True,FrameLabel->{"wind","ozone concentration"}]
Out[]=
In[]:=
T=Compile[{u},If[Abs[u]<1,(1-Abs[u]^3)^3,0]];
ww=Compile[{{i,_Integer},{x,_Real},{xx,_Real,1},{q,_Integer}},
  T[Abs[xx〚i〛-x]/Sort[Abs[xx-x]]〚q〛]];
In[]:=
Plot[T[u],{u,-1,1},Frame->True,FrameLabel->{"Normalized data value","Local weight"}]
Out[]=
In[]:=
localRegress[data_,localpols_,α_,λ_]:=Module[{xx,ff,a,b,n,x,q,xwei,y},
  {xx,ff}=Transpose[data];a=Min[xx];b=Max[xx];n=Length[xx];
  x=Range[a,b,(b-a)/(localpols-1)];q=Floor[α n];
  xwei={#,Table[ww[i,#,xx,q],{i,n}]}&/@x;
  Interpolation[{#〚1〛,Normal[LinearModelFit[data,y^Range[0,λ],y,
    Weights->#〚2〛+10^-15]]/.y->#〚1〛}&/@xwei]]
In[]:=
{fit1,fit2}={localRegress[data,20,0.9,1],localRegress[data,20,0.9,2]};
In[]:=
GraphicsGrid[{Map[Show[{pdata,Plot[#[x],{x,2,21},PlotStyle->{Thin,Red}]//Quiet}]&,{fit1,fit2}]}]
Out[]=
In[]:=
Export["E:\\wind.mtx",{wind}];
In[]:=
Export["E:\\ozone.mtx",{N[ozone]}];
〉
In[]:=
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
〉
In[]:=
import numpy as np
import pandas as pd
import scipy


def loc_eval(x, b):
    """
    Evaluate `x` using locally-weighted regression parameters.
    The degree of the polynomial used in loess is inferred
    from b. `x` is assumed to be a scalar.
    """
    loc_est = 0
    for i in enumerate(b):
        loc_est += i[1] * (x ** i[0])
    return loc_est


def loess(xvals, yvals, alpha, poly_degree=1):
    """
    Perform locally-weighted regression on xvals & yvals.
    Variables used inside the `loess` function:

    n      => number of data points in xvals
    m      => number of LOESS evaluation points
    q      => number of data points used for each
              locally-weighted regression
    v      => x-value locations for evaluating LOESS
    locsDF => contains local regression details for each
              location v
    evalDF => contains the actual LOESS output for each v
    X      => n-by-(poly_degree+1) design matrix
    W      => n-by-n diagonal weight matrix for each
              local regression
    y      => yvals
    b      => local regression coefficient estimates,
              b = `(X^T*W*X)^-1*X^T*W*y`. Note that `@`
              replaces `np.dot` in recent numpy versions.
    local_est => response for local regression
    """
    # Sort dataset by xvals.
    all_data = sorted(zip(xvals, yvals), key=lambda x: x[0])
    xvals, yvals = zip(*all_data)

    locsDF = pd.DataFrame(
        columns=[
            'loc', 'x', 'weights', 'v', 'y', 'raw_dists',
            'scale_fact', 'scaled_dists'
        ])
    evalDF = pd.DataFrame(
        columns=[
            'loc', 'est', 'b', 'v', 'g'
        ])

    n = len(xvals)
    m = n + 1
    q = int(np.floor(n * alpha) if alpha <= 1.0 else n)
    avg_interval = (max(xvals) - min(xvals)) / len(xvals)
    v_lb = max(0, min(xvals) - (.5 * avg_interval))
    v_ub = max(xvals) + (.5 * avg_interval)
    v = enumerate(np.linspace(start=v_lb, stop=v_ub, num=m), start=1)

    # Generate the design matrix based on poly_degree.
    xcols = [np.ones_like(xvals)]
    for j in range(1, poly_degree + 1):
        xcols.append([i ** j for i in xvals])
    X = np.vstack(xcols).T

    for i in v:

        iterpos = i[0]
        iterval = i[1]

        # Determine the q nearest xvals to iterval.
        iterdists = sorted([(j, np.abs(j - iterval))
                            for j in xvals], key=lambda x: x[1])

        _, raw_dists = zip(*iterdists)

        # Scale local observations by the qth-nearest raw_dist.
        scale_fact = raw_dists[q - 1]
        scaled_dists = [(j[0], (j[1] / scale_fact)) for j in iterdists]
        weights = [(j[0], ((1 - np.abs(j[1]) ** 3) ** 3
                   if j[1] <= 1 else 0)) for j in scaled_dists]

        # Remove xvals from each tuple:
        _, weights = zip(*sorted(weights, key=lambda x: x[0]))
        _, raw_dists = zip(*sorted(iterdists, key=lambda x: x[0]))
        _, scaled_dists = zip(*sorted(scaled_dists, key=lambda x: x[0]))

        iterDF1 = pd.DataFrame({
            'loc'         : iterpos,
            'x'           : xvals,
            'v'           : iterval,
            'weights'     : weights,
            'y'           : yvals,
            'raw_dists'   : raw_dists,
            'scale_fact'  : scale_fact,
            'scaled_dists': scaled_dists
        })

        locsDF = pd.concat([locsDF, iterDF1])
        W = np.diag(weights)
        y = yvals
        b = np.linalg.inv(X.T @ W @ X) @ (X.T @ W @ y)
        local_est = loc_eval(iterval, b)
        iterDF2 = pd.DataFrame({
            'loc': [iterpos],
            'b'  : [b],
            'v'  : [iterval],
            'g'  : [local_est]
        })

        evalDF = pd.concat([evalDF, iterDF2])

    # Reset indices for the returned DataFrames.
    locsDF.reset_index(inplace=True)
    locsDF.drop('index', axis=1, inplace=True)
    locsDF['est'] = 0; evalDF['est'] = 0
    locsDF = locsDF[['loc', 'est', 'v', 'x', 'y', 'raw_dists',
                     'scale_fact', 'scaled_dists', 'weights']]

    # Reset index for evalDF.
    evalDF.reset_index(inplace=True)
    evalDF.drop('index', axis=1, inplace=True)
    evalDF = evalDF[['loc', 'est', 'v', 'b', 'g']]

    return locsDF, evalDF
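As a quick sanity check before applying loess to the wind/ozone data, the function can be run on synthetic data. This usage sketch is ours and was not part of the original session:

rng = np.random.default_rng(0)
xs = np.sort(rng.uniform(0.1, 10.0, 80))          # 80 noisy sample points
ys = np.sin(xs) + rng.normal(scale=0.2, size=80)
_, ev = loess(xs, ys, alpha=0.5, poly_degree=1)
print(ev[['v', 'g']].head())                      # evaluation points and smoothed values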
〉
In[]:=
import numpy as np
〉
In[]:=
from numpy import array, matrix
〉
In[]:=
from scipy.io import mmread, mmwrite
〉
In[]:=
xv=mmread('E:\\wind.mtx')
〉
In[]:=
xvals=xv[0]
〉
In[]:=
yv=mmread('E:\\ozone.mtx')
〉
In[]:=
yvals=yv[0]
〉
In[]:=
regsDF, evalDF = loess(xvals, yvals, alpha=.9, poly_degree=1)
〉
In[]:=
l_x=evalDF['v'].values
l_x
Out[]=
NumericArray[Type: Real64, Dimensions: {113}]

In[]:=
Normal[%]
Out[]=
{2.13423,2.30148,2.46873,2.63597,2.80322,2.97046,3.13771,3.30495,3.4722,3.63945,3.80669,3.97394,4.14118,4.30843,4.47568,4.64292,4.81017,4.97741,5.14466,5.3119,5.47915,5.6464,5.81364,5.98089,6.14813,6.31538,6.48263,6.64987,6.81712,6.98436,7.15161,7.31885,7.4861,7.65335,7.82059,7.98784,8.15508,8.32233,8.48958,8.65682,8.82407,8.99131,9.15856,9.3258,9.49305,9.6603,9.82754,9.99479,10.162,10.3293,10.4965,10.6638,10.831,10.9983,11.1655,11.3328,11.5,11.6672,11.8345,12.0017,12.169,12.3362,12.5035,12.6707,12.838,13.0052,13.1725,13.3397,13.5069,13.6742,13.8414,14.0087,14.1759,14.3432,14.5104,14.6777,14.8449,15.0122,15.1794,15.3467,15.5139,15.6811,15.8484,16.0156,16.1829,16.3501,16.5174,16.6846,16.8519,17.0191,17.1864,17.3536,17.5208,17.6881,17.8553,18.0226,18.1898,18.3571,18.5243,18.6916,18.8588,19.0261,19.1933,19.3606,19.5278,19.695,19.8623,20.0295,20.1968,20.364,20.5313,20.6985,20.8658}
In[]:=
u=%;
〉
In[]:=
l_y=evalDF['g'].values
l_y
Out[]=
NumericArray[Type: Real64, Dimensions: {113}]

Fig. 4.4.4 Local regression with a polynomial of first degree
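The same figure can also be produced directly on the Python side with matplotlib. This is a sketch of ours, assuming the xvals, yvals, l_x and l_y arrays computed above:

import matplotlib.pyplot as plt

plt.scatter(xvals, yvals, s=15, label='data')
plt.plot(l_x, l_y, color='red', label='loess, degree 1')
plt.xlabel('wind'); plt.ylabel('ozone concentration')
plt.legend(); plt.show()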
〉
In[]:=
​
regsDF, evalDF = loess(xvals, yvals, alpha=.9, poly_degree=2)
〉
In[]:=
l_x=evalDF['v'].values
l_x
〉
In[]:=
l_y=evalDF['g'].values
l_y

4.4.2 Expectation Maximization

The resulting parameter values of the fitted mixture can be inspected after the fit, as shown in the sketch below.
〉
In[]:=
​
from numpy import array, matrix
from scipy.io import mmread, mmwrite
〉
In[]:=
caki=mmread('E:\\cuki.mtx')
〉
In[]:=
import numpy as np
from sklearn.mixture import GaussianMixture
〉
In[]:=
gmm=GaussianMixture(n_components=2).fit(caki)
〉
In[]:=
labels=gmm.predict(caki)
labels
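The mixture parameters estimated by the EM algorithm are exposed as standard attributes of the fitted GaussianMixture object. A minimal sketch to print them (the actual values depend on the data in caki):

print(gmm.weights_)       # mixing proportions of the two components
print(gmm.means_)         # component means
print(gmm.covariances_)   # component covariance matrices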

4.4.3 Maximum Likelihood Estimation

〉
In[]:=
​
from numpy import array, matrix
from scipy.io import mmread, mmwrite
〉
In[]:=
xcaki=mmread('E:\\xcuki.mtx')
ycaki=mmread('E:\\ycuki.mtx')
〉
In[]:=
import numpy as np
from scipy.optimize import minimize
import scipy.stats as stats
〉
In[]:=
def regressLL(params):
    # Unpack the model parameters: y = a0 + a1*sqrt(x) + a2*x + Gaussian noise
    a0 = params[0]
    a1 = params[1]
    a2 = params[2]
    sd = params[3]
    yPred = a0 + a1*xcaki**0.5 + a2*xcaki
    # Negative log-likelihood of the observations under N(yPred, sd)
    logLik = -np.sum(stats.norm.logpdf(ycaki, loc=yPred, scale=sd))
    return logLik
〉
In[]:=
initParams=np.array([1,2,0,1])
〉
In[]:=
results = minimize(regressLL, initParams, method='nelder-mead')
〉
In[]:=
results.x
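Because the noise is modeled as Gaussian, the maximum-likelihood estimates of a0, a1, a2 coincide with the ordinary least-squares solution for the design matrix [1, sqrt(x), x], which provides an independent check. This sketch is ours; the variable names are hypothetical:

x = np.ravel(xcaki); yv = np.ravel(ycaki)     # flatten the arrays read by mmread
A = np.column_stack([np.ones_like(x), np.sqrt(x), x])
coef, *_ = np.linalg.lstsq(A, yv, rcond=None)
print(coef)                                   # should agree with results.x[:3]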

4.4.4 RANSAC for Linear Models

〉
In[]:=
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/'
                 'python-machine-learning-book-2nd-edition'
                 '/master/code/ch10/housing.data.txt',
                 header=None,
                 sep=r'\s+')

df.columns=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']

X = df[['LSTAT']].values
y = df['MEDV'].values
〉
In[]:=
np.savetxt('E:\\daTaX.txt',X,fmt='%.5e')
〉
In[]:=
np.savetxt('E:\\daTay.txt',y,fmt='%.5e')
〉
In[]:=
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
〉
In[]:=
# Note: scikit-learn >= 1.0 renames loss='absolute_loss' to 'absolute_error'
ransac=RANSACRegressor(LinearRegression(),
                       max_trials=100,
                       min_samples=50,
                       loss='absolute_loss',
                       residual_threshold=5.0,
                       random_state=0).fit(X,y)
〉
In[]:=
inlier_mask=ransac.inlier_mask_
〉
In[]:=
np.savetxt('E:\\maskedX.txt',X[inlier_mask],fmt='%.5e')
〉
In[]:=
np.savetxt('E:\\maskedy.txt',y[inlier_mask],fmt='%.5e')
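To see how much the robust fit differs from a plain least-squares fit, the consensus-set line (exposed by the fitted RANSACRegressor as estimator_) can be compared with an ordinary LinearRegression on all points. A sketch of ours:

ols = LinearRegression().fit(X, y)
print('RANSAC slope/intercept:', ransac.estimator_.coef_[0], ransac.estimator_.intercept_)
print('OLS    slope/intercept:', ols.coef_[0], ols.intercept_)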