r tp1 : correction du tp 1 - upmc · r_tp1 : correction du tp 1 partie1...
TRANSCRIPT
R_TP1 : correction du TP 1
Partie 1
Dans le modèle linéaire gaussien
Simulation
# Simulate n = 20 observations of Y = 2 + 4 * X + epsilon on the grid X = 1..20.
n <- 20
X <- seq_len(n)
# NOTE(review): the noise is drawn with mean 1 (not 0) and sd sqrt(2), so the
# effective intercept is 3 -- confirm this is intended.
epsilon <- rnorm(n, mean = 1, sd = sqrt(2))
Y <- 2 + 4 * X + epsilon
Graphique
# Scatterplot of the simulated response Y against X.
plot(X,Y)
5 10 15 20
2040
6080
X
Y
Résidus studentisés $e^*$ / individus aberrants
# Fit the simple regression of Y on X and print the indices of the
# observations whose studentized residual (rstudent) exceeds 2 in absolute value.
fit <- lm(Y ~ X)
e_star <- rstudent(fit)
print(seq_len(n)[abs(e_star) > 2])
## [1] 1 17
1
Leviers, observations isolées
# Leverages of the fit (the `hat` component returned by lm.influence).
influences <- lm.influence(fit)
hat <- influences$hat
# cutoff: 4 / n, which equals 2 * p / n for the p = 2 coefficients of fit
cutoff <- 4 / n
# Indices of the observations whose leverage exceeds the cutoff.
which(hat > cutoff)
## named integer(0)
Dans le modèle linéaire gaussien avec une observation isolée
Simulation
# Same simulation as before, except that the last design point is moved to
# X = 30 so that it sits away from the other 19 (X = 1..19).
n <- 20
X <- c(1:19, 30)
# NOTE(review): the noise is drawn with mean 1 (not 0) and sd sqrt(2), so the
# effective intercept is 3 -- confirm this is intended.
epsilon <- rnorm(n, mean = 1, sd = sqrt(2))
Y <- 2 + 4 * X + epsilon
Graphique
# Scatterplot of Y against X; the point at X = 30 is the isolated one.
plot(X,Y)
0 5 10 15 20 25 30
2040
6080
100
120
X
Y
2
Résidus studentisés $e^*$ / individus aberrants
# Refit on the data with the isolated point and list the observations whose
# studentized residual (rstudent) exceeds 2 in absolute value.
fit <- lm(Y ~ X)
e_star <- rstudent(fit)
which(abs(e_star) > 2)
## 13 15## 13 15
Leviers, observations isolées
# Leverages of the fit (the `hat` component returned by lm.influence).
influences <- lm.influence(fit)
hat <- influences$hat
# cutoff: 4 / n, which equals 2 * p / n for the p = 2 coefficients of fit
cutoff <- 4 / n
# Indices of the observations whose leverage exceeds the cutoff.
which(hat > cutoff)
## 20## 20
Dans le modèle linéaire gaussien avec une observation aberrante
Simulation
# Same simulation as the first one, then the 20th response is shifted down by
# 10 to create an outlying observation.
n <- 20
X <- seq_len(n)
# NOTE(review): the noise is drawn with mean 1 (not 0) and sd sqrt(2), so the
# effective intercept is 3 -- confirm this is intended.
epsilon <- rnorm(n, mean = 1, sd = sqrt(2))
Y <- 2 + 4 * X + epsilon
Y[20] <- Y[20] - 10
Graphique
# Scatterplot of Y against X; observation 20 was shifted down by 10.
plot(X,Y)
3
5 10 15 20
2040
6080
X
Y
Résidus studentisés $e^*$ / individus aberrants
# Refit on the data with the shifted observation and list the observations
# whose studentized residual (rstudent) exceeds 2 in absolute value.
fit <- lm(Y ~ X)
e_star <- rstudent(fit)
which(abs(e_star) > 2)
## 20## 20
Leviers, observations isolées
# Leverages of the fit (the `hat` component returned by lm.influence).
influences <- lm.influence(fit)
hat <- influences$hat
# cutoff: 4 / n, which equals 2 * p / n for the p = 2 coefficients of fit
cutoff <- 4 / n
# Indices of the observations whose leverage exceeds the cutoff.
which(hat > cutoff)
## named integer(0)
Partie 2
Avant de commencer
Faire pointer R vers votre répertoire
4
# Set the working directory to the TP folder so that the relative path used by
# load() below resolves. The path is machine-specific: adapt it locally.
setwd("~/Dropbox/Modelelineaire/TP/")
(Installer et) charger les librairies faraway et dplyr
# Packages used in this part: faraway and dplyr (data manipulation).
library(faraway)
library(dplyr)
#### Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':#### filter, lag
## The following objects are masked from 'package:base':#### intersect, setdiff, setequal, union
la librairie dplyr permet de manipuler les données facilement voir la cheat sheet https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
données “Ozone”
# Load the ozone data set from the working directory.
load("ozone.Rdata")
# attach() puts the columns of ozone on the search path so that they can be
# referenced by bare name in the calls that do not pass `data =`.
attach(ozone)
# Compact overview of the columns and their types.
glimpse(ozone)
## Observations: 112## Variables: 13## $ maxO3 <dbl> 87, 82, 92, 114, 94, 80, 79, 79, 101, 106, 101, 90, 72,...## $ T9 <dbl> 15.6, 17.0, 15.3, 16.2, 17.4, 17.7, 16.8, 14.9, 16.1, 1...## $ T12 <dbl> 18.5, 18.4, 17.6, 19.7, 20.5, 19.8, 15.6, 17.5, 19.6, 2...## $ T15 <dbl> 18.4, 17.7, 19.5, 22.5, 20.4, 18.3, 14.9, 18.9, 21.4, 2...## $ Ne9 <dbl> 4, 5, 2, 1, 8, 6, 7, 5, 2, 5, 7, 7, 7, 7, 8, 6, 0, 8, 2...## $ Ne12 <dbl> 4, 5, 5, 1, 8, 6, 8, 5, 4, 6, 7, 6, 5, 7, 7, 5, 1, 3, 1...## $ Ne15 <dbl> 8, 7, 4, 0, 7, 7, 8, 4, 4, 8, 3, 8, 6, 7, 7, 4, 1, 1, 0...## $ Vx9 <dbl> 0.6946, -4.3301, 2.9544, 0.9848, -0.5000, -5.6382, -4.3...## $ Vx12 <dbl> -1.7101, -4.0000, 1.8794, 0.3473, -2.9544, -5.0000, -1....## $ Vx15 <dbl> -0.6946, -3.0000, 0.5209, -0.1736, -4.3301, -6.0000, -3...## $ maxO3v <dbl> 84, 87, 82, 92, 114, 94, 80, 99, 79, 101, 106, 101, 90,...## $ vent <fctr> Nord, Nord, Est, Nord, Ouest, Ouest, Ouest, Nord, Nord...## $ pluie <fctr> Sec, Sec, Sec, Sec, Sec, Pluie, Sec, Sec, Sec, Sec, Se...
# Number of observations in ozone (overwrites the n = 20 of Part 1); reused
# further down in the leverage cutoff.
n = nrow(ozone)
On vérifie que R a bien pris en compte les types de variables (numérique, facteur, etc)
5
scatterplot
# Scatterplot matrix of the response maxO3 and every other variable, including
# the two factors vent and pluie.
pairs(~ maxO3+T9+T12+T15+Ne9+Ne12+Ne15+Vx9+Vx12+Vx15+maxO3v +vent+pluie, data = ozone, main="Simple Scatterplot Matrix")
maxO3
15 15 35 0 6 −8 2 −8 2 1.0 4.0
40
15 T9
T12
15
15
T15
Ne9 0
0 Ne12
Ne15 0
−8 Vx9
Vx12
−5
−8 Vx15
maxO3v
40
1.0 vent
40 160 15 0 6 0 6 −5 40 160 1.0 2.0
1.0pluie
Simple Scatterplot Matrix
Modèle avec toutes les covariables (sauf Vent et Pluie) + intercept
# Full model: maxO3 on the ten numeric covariates plus an intercept.
# `data = ozone` is passed explicitly (as for the later fits on ozone4/ozone5)
# so the fit does not depend on the columns having been attach()ed.
fit <- lm(
  maxO3 ~ T9 + T12 + T15 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v,
  data = ozone
)
summary(fit)
#### Call:## lm(formula = maxO3 ~ T9 + T12 + T15 + Ne9 + Ne12 + Ne15 + Vx9 +## Vx12 + Vx15 + maxO3v)#### Residuals:## Min 1Q Median 3Q Max## -53.566 -8.727 -0.403 7.599 39.458#### Coefficients:## Estimate Std. Error t value Pr(>|t|)## (Intercept) 12.24442 13.47190 0.909 0.3656## T9 -0.01901 1.12515 -0.017 0.9866## T12 2.22115 1.43294 1.550 0.1243## T15 0.55853 1.14464 0.488 0.6266## Ne9 -2.18909 0.93824 -2.333 0.0216 *
6
## Ne12 -0.42102 1.36766 -0.308 0.7588## Ne15 0.18373 1.00279 0.183 0.8550## Vx9 0.94791 0.91228 1.039 0.3013## Vx12 0.03120 1.05523 0.030 0.9765## Vx15 0.41859 0.91568 0.457 0.6486## maxO3v 0.35198 0.06289 5.597 1.88e-07 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### Residual standard error: 14.36 on 101 degrees of freedom## Multiple R-squared: 0.7638, Adjusted R-squared: 0.7405## F-statistic: 32.67 on 10 and 101 DF, p-value: < 2.2e-16
Diagnostics sur la linéarité : résidus partiels
Résidus studentisés $e^*$
# car provides crPlots() and a vif() that masks the one from faraway.
library(car)
#### Attaching package: 'car'
## The following object is masked from 'package:dplyr':#### recode
## The following objects are masked from 'package:faraway':#### logit, vif
# Component + residual (partial residual) plots, one panel per covariate of fit.
crPlots(fit)
7
15 20 25
−40
040
T9Com
pone
nt+
Res
idua
l(max
O3)
15 20 25 30
−40
040
T12Com
pone
nt+
Res
idua
l(max
O3)
15 20 25 30 35
−40
040
T15Com
pone
nt+
Res
idua
l(max
O3)
0 2 4 6 8
−40
040
Ne9Com
pone
nt+
Res
idua
l(max
O3)
0 2 4 6 8−
400
40
Ne12Com
pone
nt+
Res
idua
l(max
O3)
0 2 4 6 8
−40
040
Ne15Com
pone
nt+
Res
idua
l(max
O3)
−8 −4 0 2 4
−40
040
Vx9Com
pone
nt+
Res
idua
l(max
O3)
−5 0 5
−40
040
Vx12Com
pone
nt+
Res
idua
l(max
O3)
−8 −4 0 2 4
−40
040
Vx15Com
pone
nt+
Res
idua
l(max
O3)
40 80 120 160
−20
20
maxO3vCom
pone
nt+
Res
idua
l(max
O3)
Component + Residual Plots
Ce n’est pas parfait pour les variables Vx et les températures, donc elles sont peut-être à transformer ; à voir après le diagnostic de multicolinéarité.
Diagnostic pour la multicolinéarité
# Columns 2 to 11 of ozone are the ten numeric covariates (T9 ... maxO3v);
# cosinus is their correlation matrix.
X <- ozone[, 2:11]
cosinus <- cor(X)
Calcul des valeurs propres et vecteurs propres
# Eigen-decomposition of the correlation matrix, then the ratio of the largest
# eigenvalue to each eigenvalue (the last ratio is the largest one).
propres <- eigen(cosinus)
propres$values[1] / propres$values
## [1] 1.000000 2.978073 6.121403 9.498678 13.947277 16.572784## [7] 30.236739 34.548379 63.734767 170.905064
8
# Label the rows of the eigenvector matrix with the covariate names, then
# print it to read off which variables load on each direction.
row.names(propres$vectors) <- colnames(X)
propres$vectors
## [,1] [,2] [,3] [,4] [,5]## T9 -0.3096065 -0.40963366 -0.19164545 0.29692679 0.09008952## T12 -0.3615383 -0.32364167 -0.07323583 0.24286497 0.17120264## T15 -0.3691464 -0.29539993 -0.02314353 0.10089809 0.32899350## Ne9 0.3391999 -0.08585786 -0.34980361 -0.29468584 0.59681282## Ne12 0.3568045 -0.03009641 -0.42944390 0.06176426 0.27483104## Ne15 0.2982936 -0.06125711 -0.49896558 0.55607438 -0.43786526## Vx9 -0.2996933 0.34045655 -0.30623480 -0.04439494 0.12788273## Vx12 -0.2893931 0.46092648 -0.20011708 0.06527457 0.07255336## Vx15 -0.2637260 0.47819016 -0.22954662 0.10843078 0.05003552## maxO3v -0.2504075 -0.26671276 -0.46379240 -0.65177331 -0.45472150## [,6] [,7] [,8] [,9] [,10]## T9 -0.27821836 0.09466338 -0.28862446 0.63934388 -0.15780744## T12 0.10680905 -0.19120759 0.21217785 -0.18898439 0.73633272## T15 0.08035330 0.07964563 0.10493479 -0.54163219 -0.58338687## Ne9 -0.02279269 -0.52682902 0.08726227 0.14778801 -0.06826178## Ne12 -0.13357923 0.69261772 -0.13504514 -0.21018154 0.21917938## Ne15 0.15795633 -0.28250361 0.08412479 -0.13346377 -0.16646039## Vx9 0.75283065 0.18027970 0.02076803 0.28462186 -0.03837886## Vx12 -0.21913407 -0.27356480 -0.66745727 -0.28098860 0.08315996## Vx15 -0.48719105 0.05855396 0.61764693 0.09657372 -0.05685442## maxO3v -0.09637299 0.01707700 0.01783386 -0.09506857 0.02326839
La matrice est mal conditionnée (κ = 170.90), on regarde dans la direction propre associée à la plus petite valeur propre. Les variables qui contribuent le plus à cette direction sont T12 et T15.
Variance inflation factors
# Variance inflation factors of the full model. vif() resolves to car::vif
# here, since loading car masked faraway::vif.
vif(fit)
## T9 T12 T15 Ne9 Ne12 Ne15 Vx9## 6.645125 18.060645 14.477998 3.190681 5.242639 2.944332 3.105152## Vx12 Vx15 maxO3v## 4.684647 3.564266 1.702196
on enlève une variable de température, par exemple T15
on recommence pour voir s’il y a d’autres problèmes de colinéarité
# Same diagnostic without column 4 (T15): correlation matrix of the remaining
# nine covariates and ratios of its largest eigenvalue to each eigenvalue.
X2 <- ozone[, c(2:3, 5:11)]
cosinus2 <- cor(X2)
propres2 <- eigen(cosinus2)
propres2$values[1] / propres2$values
## [1] 1.000000 2.889979 5.348564 8.375523 13.691391 14.813202 26.578352## [8] 30.519160 79.320311
9
# Label the rows of the eigenvector matrix with the covariate names and print it.
row.names(propres2$vectors) <- colnames(X2)
propres2$vectors
## [,1] [,2] [,3] [,4] [,5]## T9 -0.3014244 -0.483340657 -0.2014031 0.33031215 -0.3547365## T12 -0.3591360 -0.393830071 -0.0804561 0.25863513 -0.2161581## Ne9 0.3707452 -0.009626102 -0.3446952 -0.34137764 -0.5320107## Ne12 0.3859304 0.052898671 -0.4229116 0.03055615 -0.2613863## Ne15 0.3209318 -0.005946441 -0.4962124 0.57787371 0.4611151## Vx9 -0.3386568 0.309010547 -0.3031162 -0.05985390 0.2049740## Vx12 -0.3377447 0.429942195 -0.1958839 0.05750734 -0.1662350## Vx15 -0.3112889 0.453959063 -0.2245350 0.10039963 -0.2364438## maxO3v -0.2551714 -0.349194434 -0.4779141 -0.59634025 0.3713466## [,6] [,7] [,8] [,9]## T9 0.08357654 -0.163304329 0.204983334 0.56951414## T12 -0.26212846 0.170975943 -0.240300507 -0.66343134## Ne9 -0.24255450 0.507503560 -0.083430531 0.14392121## Ne12 0.03619172 -0.696899755 0.098695925 -0.32160336## Ne15 0.05868414 0.314251466 -0.029318621 0.03343776## Vx9 -0.74680937 -0.207841640 -0.073314201 0.21459364## Vx12 0.15840973 0.242264907 0.704471891 -0.22737627## Vx15 0.43751688 -0.014532878 -0.617247714 0.08939612## maxO3v 0.29407206 0.003764207 0.005101048 -0.06715894
# Model without T15, and its variance inflation factors.
# `data = ozone` is passed explicitly (as for the later fits on ozone4/ozone5)
# so the fit does not depend on the columns having been attach()ed.
fit2 <- lm(
  maxO3 ~ T9 + T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v,
  data = ozone
)
vif(fit2)
## T9 T12 Ne9 Ne12 Ne15 Vx9 Vx12 Vx15## 6.553571 8.396044 3.167582 4.895314 2.067436 3.066914 4.676930 3.525638## maxO3v## 1.697845
on peut aussi enlever T9 mais ce n’est pas obligatoire
# Same diagnostic without columns 2 and 4 (T9 and T15): correlation matrix of
# the remaining eight covariates and ratios of its largest eigenvalue to each
# eigenvalue.
X3 <- ozone[, c(3, 5:11)]
cosinus3 <- cor(X3)
propres3 <- eigen(cosinus3)
propres3$values[1] / propres3$values
## [1] 1.000000 3.371578 5.212822 8.858504 13.606211 16.554838 25.459573## [8] 31.790846
# Label the rows of the eigenvector matrix with the covariate names and print it.
row.names(propres3$vectors) <- colnames(X3)
propres3$vectors
## [,1] [,2] [,3] [,4] [,5] [,6]## T12 0.3367620 0.4418044 0.16876993 -0.38996185 0.3591047 0.59512107## Ne9 -0.3861669 -0.1112980 0.33904407 0.44796693 0.3805744 0.38892949## Ne12 -0.4028113 -0.2372386 0.33638587 0.06127312 0.0366145 0.11452642## Ne15 -0.3412110 -0.1895938 0.38813424 -0.75061047 -0.1602026 -0.09455094## Vx9 0.3715166 -0.2980341 0.27953334 -0.05122951 0.6460134 -0.47736175
10
## Vx12 0.3753541 -0.4505031 0.08862525 0.04940848 -0.1072905 0.11867371## Vx15 0.3494203 -0.4893557 0.10176524 0.02217244 -0.3418091 0.42703863## maxO3v 0.2403638 0.4110320 0.70456561 0.27299750 -0.3918207 -0.21951971## [,7] [,8]## T12 0.09335529 0.13786303## Ne9 -0.41160383 -0.23946179## Ne12 0.65178051 0.47463277## Ne15 -0.28090809 -0.14160109## Vx9 0.14622434 -0.16061953## Vx12 -0.46179197 0.63797656## Vx15 0.28811540 -0.49536120## maxO3v -0.02227147 -0.01136917
# Model without T9 and T15, and its variance inflation factors.
# `data = ozone` is passed explicitly (as for the later fits on ozone4/ozone5)
# so the fit does not depend on the columns having been attach()ed.
fit3 <- lm(
  maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v,
  data = ozone
)
vif(fit3)
## T12 Ne9 Ne12 Ne15 Vx9 Vx12 Vx15 maxO3v## 2.417951 2.975058 4.331558 2.067268 2.637446 4.434996 3.501469 1.530246
On refait les résidus partiels pour ce problème de linéarité
# Component + residual (partial residual) plots for the reduced model fit3.
crPlots(fit3)
15 20 25 30
−40
040
T12Com
pone
nt+
Res
idua
l(max
O3)
0 2 4 6 8
−40
040
Ne9Com
pone
nt+
Res
idua
l(max
O3)
0 2 4 6 8
−40
040
Ne12Com
pone
nt+
Res
idua
l(max
O3)
0 2 4 6 8
−40
040
Ne15Com
pone
nt+
Res
idua
l(max
O3)
−8 −4 0 2 4
−40
040
Vx9Com
pone
nt+
Res
idua
l(max
O3)
−5 0 5
−40
040
Vx12Com
pone
nt+
Res
idua
l(max
O3)
−8 −4 0 2 4
−40
040
Vx15Com
pone
nt+
Res
idua
l(max
O3)
40 80 120 160
−20
20
maxO3vCom
pone
nt+
Res
idua
l(max
O3)
Component + Residual Plots
à revoir après les diagnostics sur les observations influentes/aberrantes
11
Observations aberrantes/isolées
Graphiques
# One plot per page, then the six lm diagnostic plots of fit3 with points
# labelled by the row names of ozone.
par(mfrow = c(1, 1))
plot(fit3, which = 1:6, labels.id = rownames(ozone))
60 80 100 120 140 160
−60
−40
−20
020
40
Fitted values
Res
idua
ls
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Residuals vs Fitted
20010731
20010707 20010824
12
−2 −1 0 1 2
−4
−2
02
Theoretical Quantiles
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Normal Q−Q
20010731
2001082420010707
60 80 100 120 140 160
0.0
0.5
1.0
1.5
2.0
Fitted values
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Scale−Location20010731
2001082420010707
13
0 20 40 60 80 100
0.00
0.05
0.10
0.15
0.20
0.25
Obs. number
Coo
k's
dist
ance
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Cook's distance
20010731
20010824
20010621
0.00 0.05 0.10 0.15
−4
−2
02
Leverage
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Cook's distance0.5
Residuals vs Leverage
20010731
20010824
20010621
14
0.00
0.05
0.10
0.15
0.20
0.25
Leverage hii
Coo
k's
dist
ance
0 0.05 0.1 0.15
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
0
1
2345
Cook's dist vs Leverage hii (1 − hii)20010731
20010824
20010621
il y a des observations influentes, on les cherche
Résidus studentisés
# Studentized residuals of fit3; print the row names of the observations whose
# residual exceeds 2 in absolute value.
e_star <- rstudent(fit3)
print(rownames(ozone)[abs(e_star) > 2])
## [1] "20010621" "20010707" "20010725" "20010731" "20010824"
# Values of the studentized residuals for those flagged observations.
print(e_star[abs(e_star) > 2])
## 18 34 52 58 79## -2.022151 3.067923 2.934634 -4.335027 3.070085
on commence par enlever “20010707” “20010731” “20010824”
leviers
# Leverages of fit3 (the `hat` component returned by lm.influence).
influences <- lm.influence(fit3)
hat <- influences$hat
# cutoff: 2 * p / n, with p the number of coefficients of fit3 (9 here:
# eight covariates plus the intercept) and n the number of rows of ozone.
cutoff <- (2 * length(coef(fit3))) / nrow(ozone)
print(rownames(ozone)[hat > cutoff])
## [1] "20010621" "20010717" "20010727" "20010923"
15
# Leverage values of the observations above the cutoff.
print(hat[hat > cutoff])
## 18 44 54 106## 0.1658886 0.1653847 0.1683812 0.1667629
on enlève aussi “20010621”, qui a un grand résidu et un grand levier
Modèle sans ces observations
# Summary of fit3 on the full data, as the reference before the flagged
# observations are removed.
summary(fit3)
#### Call:## lm(formula = maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 +## maxO3v)#### Residuals:## Min 1Q Median 3Q Max## -53.404 -8.429 -0.928 7.662 40.952#### Coefficients:## Estimate Std. Error t value Pr(>|t|)## (Intercept) 13.17010 13.21552 0.997 0.3213## T12 2.76616 0.51981 5.321 6.04e-07 ***## Ne9 -2.23742 0.89820 -2.491 0.0143 *## Ne12 -0.23101 1.23249 -0.187 0.8517## Ne15 -0.08300 0.83305 -0.100 0.9208## Vx9 0.98351 0.83355 1.180 0.2408## Vx12 0.06185 1.01791 0.061 0.9517## Vx15 0.36901 0.89979 0.410 0.6826## maxO3v 0.35433 0.05912 5.994 3.05e-08 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### Residual standard error: 14.24 on 103 degrees of freedom## Multiple R-squared: 0.7633, Adjusted R-squared: 0.7449## F-statistic: 41.51 on 8 and 103 DF, p-value: < 2.2e-16
# Drop rows 18, 34, 58 and 79, i.e. the observations 20010621, 20010707,
# 20010731 and 20010824 flagged above.
ozone4 = dplyr::slice(ozone,-c(18,34,58,79))
on écrit “dplyr::slice” (avec le préfixe du package) plutôt que “slice” pour éviter les collisions de noms entre packages : par exemple, la fonction select existe à la fois dans dplyr et dans le package MASS. Ce n’est pas nécessaire ici mais c’est bon à savoir... Les fonctions dplyr ignorent les rownames, il faut les rajouter si besoin,
# Restore the row names on ozone4 from ozone, dropping the same four rows.
rownames(ozone4) = rownames(ozone)[-c(18,34,58,79)]
ou mieux les mettre avant dans une colonne ID
16
# Store the row names in an ID column first so that they survive the dplyr
# call, then drop rows 18, 34, 58 and 79.
ozone$ID <- rownames(ozone)
ozone4 <- dplyr::slice(ozone, -c(18, 34, 58, 79))
# Refit the reduced model on ozone4 (the four flagged observations removed).
fit4 <- lm(
  maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v,
  data = ozone4
)
summary(fit4)
#### Call:## lm(formula = maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 +## maxO3v, data = ozone4)#### Residuals:## Min 1Q Median 3Q Max## -23.426 -8.425 -0.746 7.561 41.290#### Coefficients:## Estimate Std. Error t value Pr(>|t|)## (Intercept) 17.28270 11.01796 1.569 0.11993## T12 2.49464 0.43620 5.719 1.14e-07 ***## Ne9 -2.24531 0.79565 -2.822 0.00577 **## Ne12 -1.12074 1.09956 -1.019 0.31056## Ne15 0.21056 0.70219 0.300 0.76491## Vx9 0.62032 0.69755 0.889 0.37601## Vx12 0.33046 0.86158 0.384 0.70213## Vx15 0.28940 0.76323 0.379 0.70537## maxO3v 0.40705 0.05164 7.882 4.30e-12 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### Residual standard error: 11.81 on 99 degrees of freedom## Multiple R-squared: 0.8311, Adjusted R-squared: 0.8174## F-statistic: 60.88 on 8 and 99 DF, p-value: < 2.2e-16
le R² a beaucoup augmenté, on vérifie graphiquement que tout va bien
# The six lm diagnostic plots of fit4, with points labelled by the ID column.
plot(fit4,which=1:6,labels.id = ozone4$ID)
17
60 80 100 120 140 160
−20
020
40
Fitted values
Res
idua
ls
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Residuals vs Fitted
20010725
2001071920010705
−2 −1 0 1 2
−2
−1
01
23
4
Theoretical Quantiles
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Normal Q−Q
20010725
2001071920010705
18
60 80 100 120 140 160
0.0
0.5
1.0
1.5
Fitted values
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Scale−Location20010725
2001071920010705
0 20 40 60 80 100
0.00
0.02
0.04
0.06
0.08
Obs. number
Coo
k's
dist
ance
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Cook's distance
20010725
2001072720010624
19
0.00 0.05 0.10 0.15
−2
−1
01
23
4
Leverage
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Cook's distance
Residuals vs Leverage
20010725
20010727
20010624
0.00
0.02
0.04
0.06
0.08
Leverage hii
Coo
k's
dist
ance
0 0.05 0.1 0.15
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
0
1234
Cook's dist vs Leverage hii (1 − hii)20010725
2001072720010624
ça fait ressortir l’obs 20010725
# Studentized residuals of fit4; print the IDs of the observations whose
# residual exceeds 2 in absolute value.
e_star <- rstudent(fit4)
print(ozone4$ID[abs(e_star) > 2])
## [1] "20010628" "20010630" "20010705" "20010719" "20010725"
20
# Values of the studentized residuals for those flagged observations.
print(e_star[abs(e_star) > 2])
## 24 26 31 44 50## -2.032183 -2.057244 2.085461 2.133593 3.838064
# Leverages of fit4 (the `hat` component returned by lm.influence).
influences <- lm.influence(fit4)
hat <- influences$hat
# cutoff: 2 * p / n, with p the number of coefficients of fit4 (9 here:
# eight covariates plus the intercept) and n the number of rows of ozone4.
cutoff <- (2 * length(coef(fit4))) / nrow(ozone4)
print(ozone4$ID[hat > cutoff])
## [1] "20010717" "20010727" "20010828" "20010923"
# Leverage values of the observations above the cutoff.
print(hat[hat > cutoff])
## 42 52 79 102## 0.1689614 0.1777409 0.1671314 0.1685623
on enlève cette observation et on s’arrête là !!
# Summary of fit4, as the reference before observation 20010725 is removed.
summary(fit4)
#### Call:## lm(formula = maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 +## maxO3v, data = ozone4)#### Residuals:## Min 1Q Median 3Q Max## -23.426 -8.425 -0.746 7.561 41.290#### Coefficients:## Estimate Std. Error t value Pr(>|t|)## (Intercept) 17.28270 11.01796 1.569 0.11993## T12 2.49464 0.43620 5.719 1.14e-07 ***## Ne9 -2.24531 0.79565 -2.822 0.00577 **## Ne12 -1.12074 1.09956 -1.019 0.31056## Ne15 0.21056 0.70219 0.300 0.76491## Vx9 0.62032 0.69755 0.889 0.37601## Vx12 0.33046 0.86158 0.384 0.70213## Vx15 0.28940 0.76323 0.379 0.70537## maxO3v 0.40705 0.05164 7.882 4.30e-12 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### Residual standard error: 11.81 on 99 degrees of freedom## Multiple R-squared: 0.8311, Adjusted R-squared: 0.8174## F-statistic: 60.88 on 8 and 99 DF, p-value: < 2.2e-16
21
# Remove the observation whose ID is "20010725", selecting it by value of the
# ID column rather than by row position.
ozone5 = dplyr::filter(ozone4, ID != "20010725")
autre solution pour enlever les observations avec une condition sur une colonne via la fonction filter
# Refit the reduced model on ozone5 (observation 20010725 also removed).
fit5 <- lm(
  maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v,
  data = ozone5
)
summary(fit5)
#### Call:## lm(formula = maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 +## maxO3v, data = ozone5)#### Residuals:## Min 1Q Median 3Q Max## -22.9561 -8.2109 -0.2722 7.9270 24.9984#### Coefficients:## Estimate Std. Error t value Pr(>|t|)## (Intercept) 20.48193 10.35877 1.977 0.05082 .## T12 2.24280 0.41400 5.417 4.32e-07 ***## Ne9 -2.15053 0.74603 -2.883 0.00485 **## Ne12 -1.49622 1.03505 -1.446 0.15150## Ne15 0.40336 0.65996 0.611 0.54249## Vx9 0.43014 0.65556 0.656 0.51328## Vx12 0.47487 0.80828 0.588 0.55821## Vx15 0.26084 0.71528 0.365 0.71615## maxO3v 0.43236 0.04884 8.852 3.74e-14 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### Residual standard error: 11.07 on 98 degrees of freedom## Multiple R-squared: 0.8464, Adjusted R-squared: 0.8339## F-statistic: 67.52 on 8 and 98 DF, p-value: < 2.2e-16
Diagnostic sur les résidus
Graphiques
# The six lm diagnostic plots of fit5. Points are labelled with the ID column:
# ozone5 comes out of dplyr calls, so rownames(ozone5) are just row numbers
# and would not identify the observations.
plot(fit5, which = 1:6, labels.id = ozone5$ID)
22
60 80 100 120 140 160
−20
−10
010
2030
Fitted values
Res
idua
ls
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Residuals vs Fitted
4431
24
−2 −1 0 1 2
−2
−1
01
2
Theoretical Quantiles
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Normal Q−Q
4431
24
23
60 80 100 120 140 160
0.0
0.5
1.0
1.5
Fitted values
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Scale−Location4431 24
0 20 40 60 80 100
0.00
0.02
0.04
Obs. number
Coo
k's
dist
ance
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Cook's distance
20
5127
24
0.00 0.05 0.10 0.15
−2
−1
01
2
Leverage
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
Cook's distance
Residuals vs Leverage
20
51
27
0.00
0.01
0.02
0.03
0.04
0.05
Leverage hii
Coo
k's
dist
ance
0 0.05 0.1 0.15
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v)
0
0.511.522.5
Cook's dist vs Leverage hii (1 − hii)20
5127
Ça paraît pas mal...
25
on ajoute pluie et vent
# Summary of fit5, as the reference before pluie and vent are added.
summary(fit5)
#### Call:## lm(formula = maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 +## maxO3v, data = ozone5)#### Residuals:## Min 1Q Median 3Q Max## -22.9561 -8.2109 -0.2722 7.9270 24.9984#### Coefficients:## Estimate Std. Error t value Pr(>|t|)## (Intercept) 20.48193 10.35877 1.977 0.05082 .## T12 2.24280 0.41400 5.417 4.32e-07 ***## Ne9 -2.15053 0.74603 -2.883 0.00485 **## Ne12 -1.49622 1.03505 -1.446 0.15150## Ne15 0.40336 0.65996 0.611 0.54249## Vx9 0.43014 0.65556 0.656 0.51328## Vx12 0.47487 0.80828 0.588 0.55821## Vx15 0.26084 0.71528 0.365 0.71615## maxO3v 0.43236 0.04884 8.852 3.74e-14 ***## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### Residual standard error: 11.07 on 98 degrees of freedom## Multiple R-squared: 0.8464, Adjusted R-squared: 0.8339## F-statistic: 67.52 on 8 and 98 DF, p-value: < 2.2e-16
# Same model as fit5 with the two factors pluie and vent added.
fit6 <- lm(
  maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v + pluie + vent,
  data = ozone5
)
summary(fit6)
#### Call:## lm(formula = maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 +## maxO3v + pluie + vent, data = ozone5)#### Residuals:## Min 1Q Median 3Q Max## -22.0927 -8.4341 -0.1493 7.5446 22.9718#### Coefficients:## Estimate Std. Error t value Pr(>|t|)## (Intercept) 21.76214 12.29805 1.770 0.08004 .## T12 2.14450 0.46879 4.574 1.46e-05 ***## Ne9 -2.20823 0.76399 -2.890 0.00478 **## Ne12 -1.29409 1.07592 -1.203 0.23209## Ne15 0.33473 0.67353 0.497 0.62037## Vx9 0.25207 0.71216 0.354 0.72417
26
## Vx12 0.09461 0.95435 0.099 0.92124## Vx15 0.39997 0.74580 0.536 0.59302## maxO3v 0.43079 0.05183 8.311 7.07e-13 ***## pluieSec 2.68524 2.68550 1.000 0.31992## ventNord -1.46604 5.16838 -0.284 0.77730## ventOuest -3.23420 6.36697 -0.508 0.61267## ventSud 0.69951 5.49859 0.127 0.89904## ---## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1#### Residual standard error: 11.15 on 94 degrees of freedom## Multiple R-squared: 0.8505, Adjusted R-squared: 0.8314## F-statistic: 44.57 on 12 and 94 DF, p-value: < 2.2e-16
on revérifie que tout se passe bien pour les résidus
# The six lm diagnostic plots of fit6. Points are labelled with the ID column:
# ozone5 comes out of dplyr calls, so rownames(ozone5) are just row numbers
# and would not identify the observations.
plot(fit6, which = 1:6, labels.id = ozone5$ID)
60 80 100 120 140 160
−20
−10
010
20
Fitted values
Res
idua
ls
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v + pluie + v ...
Residuals vs Fitted
3144
26
27
−2 −1 0 1 2
−2
−1
01
2
Theoretical Quantiles
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v + pluie + v ...
Normal Q−Q
3144
26
60 80 100 120 140 160
0.0
0.5
1.0
1.5
Fitted values
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v + pluie + v ...
Scale−Location3144 26
28
0 20 40 60 80 100
0.00
0.02
0.04
Obs. number
Coo
k's
dist
ance
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v + pluie + v ...
Cook's distance
31 51
44
0.00 0.05 0.10 0.15 0.20
−2
−1
01
2
Leverage
Sta
ndar
dize
d re
sidu
als
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v + pluie + v ...
Cook's distance
Residuals vs Leverage
31
51
44
29
0.00
0.02
0.04
Leverage hii
Coo
k's
dist
ance
0 0.05 0.1 0.15 0.2
lm(maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v + pluie + v ...
0
0.511.522.5
Cook's dist vs Leverage hii (1 − hii)31 51
44
Sélection de variables
# Best-subset search over the covariates of fit6: keep the single best model
# (nbest = 1) for each size up to 11 variables, then display the selected
# variables of each model ordered by BIC.
library(leaps)
choix <- regsubsets(
  maxO3 ~ T12 + Ne9 + Ne12 + Ne15 + Vx9 + Vx12 + Vx15 + maxO3v + pluie + vent,
  data = ozone5,
  nbest = 1,
  nvmax = 11
)
plot(choix, scale = "bic")
bic
(Int
erce
pt)
T12
Ne9
Ne1
2
Ne1
5
Vx9
Vx1
2
Vx1
5
max
O3v
plui
eSec
vent
Nor
d
vent
Oue
st
vent
Sud
−100−140−150−150−160−160−160−170−170−170−180
30
# Cp criterion of the best model of each size (one value per model size).
summary(choix)$cp
## [1] 123.184393 43.565883 5.419703 1.468441 1.687265 2.744714## [7] 3.875789 5.455315 7.198315 9.019472 11.009827
# Variables included in the model with the smallest Cp. The row is located
# with which.min() instead of being hard-coded (it is row 4 on these data).
resume_choix <- summary(choix)
resume_choix$which[which.min(resume_choix$cp), ]
## (Intercept) T12 Ne9 Ne12 Ne15 Vx9## TRUE TRUE TRUE FALSE FALSE FALSE## Vx12 Vx15 maxO3v pluieSec ventNord ventOuest## FALSE FALSE TRUE FALSE FALSE TRUE## ventSud## FALSE
on garde donc les variables T12, Ne9,maxO3v,vent
31