Mineração de dados: o que fazer? albertini/1sem2017/md/aulas/02preproc.pdf · 2017-04-13 ·...
TRANSCRIPT
Mineracao de Dados: o que fazer?
1. Decidir o que voce quer fazer: tarefas de mineracao
2. Descrever as tarefas em forma de um programa de computador
3. Executar o programa e apresentar o resultado
Packages para analise de dados: pre-processamento
I Visualizacao de dadosI ggplot2, googleVis, rworldmap
I Manipulacao de dadosI dplyr, data.table
I Imputacao de dados faltantesI MissForest, MissMDA, Mice, MI, Imputation
I Deteccao de outliersI outliers, robust, psych
I Selecao de caracterısticasI fscaret, RRF
I Reducao de dimensionalidadeI FactoMineR, dimRed
I Chickwts: estudo sobre crescimento de frango de acordo com o suplemento alimentar
data(chickwts)
plot(chickwts$feed)
casein horsebean linseed meatmeal soybean sunflower
02
46
810
1214
data(chickwts)
feeds <- table(chickwts$feed)# obter frequencias
feeds
##
## casein horsebean linseed meatmeal soybean
## 12 10 12 11 14
## sunflower
## 12
barplot(feeds)
casein horsebean linseed meatmeal soybean sunflower
02
46
810
1214
barplot(feeds[order(feeds, decreasing=TRUE)])
soybean casein linseed sunflower meatmeal horsebean
02
46
810
1214
barplot(feeds[order(feeds)], horiz=TRUE, las=1,
col=topo.colors(length(feeds)), border=NA,
main="Animais alimentados \npor suplemento (chickwts)",
xlab="Numero de frangos")
horsebean
meatmeal
casein
linseed
sunflower
soybean
Animais alimentados por suplemento (chickwts)
Número de frangos
0 2 4 6 8 10 12 14
feeds <- table(chickwts$feed)
pie(feeds)
casein
horsebean
linseed
meatmeal
soybean
sunflower
pie(feeds[order(feeds, decreasing=TRUE)],
init.angle=90,
clockwise=TRUE,
col = c("seashell", "cadetblue2", "lightpink",
"lightcyan", "plum1", "papayawhip"),
main = "Uso de cada suplemento (chickwts)")
soybean
casein
linseed
sunflower
meatmeal
horsebean
Uso de cada suplemento (chickwts)
data(lynx) # captura de felinos Lynx 1821 - 1934
hist(lynx)
Histogram of lynx
lynx
Fre
quen
cy
0 1000 2000 3000 4000 5000 6000 7000
010
2030
4050
60
data(lynx) # captura de felinos Lynx 1821 - 1934
h <- hist(lynx, breaks=11, freq=FALSE,
col = "thistle1",
main = "Captura anual de linces no Canada\n1821 - 1934",
xlab = "Numero de linces capturados")
curve(dnorm(x, mean=mean(lynx), sd=sd(lynx)),
col = "thistle4",
lwd = 2,
add = TRUE)
Captura anual de linces no Canadá1821 − 1934
Número de linces capturados
Den
sity
0 1000 2000 3000 4000 5000 6000 7000
0e+
002e
−04
4e−
046e
−04
data(USJudgeRatings) # avaliac~oes de juızes
boxplot(USJudgeRatings$RTEN)
●
●
●●
56
78
9
# Notched, horizontal boxplots for every column of USJudgeRatings
# (ratings of judges), all drawn on the same 0-10 value axis.
data(USJudgeRatings)
boxplot(USJudgeRatings,
        horizontal = TRUE, las = 1, notch = TRUE,
        col = "slategray3", boxwex = 0.5, whisklty = 1,
        outpch = 16, outcol = "slategray3",
        # Fix: the box-plot parameter is "staplelty" (line type of the whisker
        # end caps); the original "stoplelty" is not a recognised parameter.
        staplelty = 0,
        ylim = c(0, 10), xlab = "Avaliações")
●●
●
●
●
●●
●
●●
●
● ●● ●
● ● ●●
CONT
INTG
DMNR
DILG
CFMG
DECI
PREP
FAMI
ORAL
WRIT
PHYS
RTEN
0 2 4 6 8 10
Avaliações
data(swiss) # 1888 sobre fertilidade e economia suıca
fertility <- swiss$Fertility
hist(fertility, prob = TRUE, ylim = c(0, 0.04),
xlim = c(30, 100), breaks =11, col = "gray",
main = "Fertilidade nas 47 provincias suıcas")
Fertilidade nas 47 provincias suíças
fertility
Den
sity
30 40 50 60 70 80 90 100
0.00
0.01
0.02
0.03
0.04
curve(dnorm(x, mean=mean(fertility), sd =sd(fertility)),
col="red", lwd=3, add=TRUE)
lines(density(fertility), col="blue")
lines(density(fertility, adjust=3), col="darkgreen")
rug(fertility, col="red") # plot de linhas sob hist
Fertilidade nas 47 províncias suíças
fertility
Den
sity
30 40 50 60 70 80 90 100
0.00
0.01
0.02
0.03
0.04
# Density-scaled histogram of the iris petal lengths, overlaid with a normal
# curve fitted to the sample, a kernel density estimate, and a rug of the
# raw observations.
data(iris)
pl <- iris$Petal.Length
pl_mean <- mean(pl)
pl_sd <- sd(pl)
hist(pl,
     prob = TRUE,
     col = "gray",
     main = "Comprimento de petalas de flores Iris")
curve(dnorm(x, mean = pl_mean, sd = pl_sd),
      col = "red", lwd = 3, add = TRUE)
pl_density <- density(pl)
lines(pl_density, lwd = 4, col = "darkgreen")
rug(pl, col = "red") # tick marks under the histogram
Comprimento de pétalas de flores Iris
pl
Den
sity
1 2 3 4 5 6 7
0.0
0.1
0.2
0.3
0.4
0.5
Visualizacao de dados: ggplot2
data(mpg) # hwy: mpg consumo de carros, drv: tipo
qplot(displ, hwy, data = mpg, color=drv)
●●
●
●
●●
●
●
●
●
●
●● ●●
●
●
●
●
●
●
● ●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
● ●
●●
●●
●
●
●
● ●
●
●
●●
●●
●
●
●
● ●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
● ●
●
●
●
●
● ●
●●●
●●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●
●
●
●●
●
●
●
●
●
●●
●
●
●
●
●
●●●
●
●
●
●
●
●
●
●
●
● ●
●
●
●
●
●
● ●
●
●
●
●
●
●
●●
● ●
●●
●
●
● ●
●
●
●●
●
●
●
●
●
●● ●●
●
●
●
●
●●
●
●
●
●
●
●
●●
●●
●
●
●
●●
●●
●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●●
●
●
●
●
●● ●●
●
●
●
●
●
●
●
●●●
●
●
●● ●
20
30
40
2 3 4 5 6 7
displ
hwy
drv●
●
●
4
f
r
Visualizacao de dados: ggplot2
qplot(displ, hwy, data = mpg,
geom=c('point', #mantem os pontos
'smooth'))#tendencia dos pontos - area cinza
●●
●
●
●●
●
●
●
●
●
●● ●●
●
●
●
●
●
●
● ●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
● ●
●●
●●
●
●
●
● ●
●
●
●●
●●
●
●
●
● ●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
● ●
●
●
●
●
● ●
●●●
●●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●
●
●
●●
●
●
●
●
●
●●
●
●
●
●
●
●●●
●
●
●
●
●
●
●
●
●
● ●
●
●
●
●
●
● ●
●
●
●
●
●
●
●●
● ●
●●
●
●
● ●
●
●
●●
●
●
●
●
●
●● ●●
●
●
●
●
●●
●
●
●
●
●
●
●●
●●
●
●
●
●●
●●
●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●●
●
●
●
●
●● ●●
●
●
●
●
●
●
●
●●●
●
●
●● ●
20
30
40
2 3 4 5 6 7
displ
hwy
Visualizacao de dados: ggplot2
qplot(hwy, data = mpg, fill=drv)
0
10
20
30
40
10 20 30 40
hwy
coun
t
drv
4
f
r
Visualizacao de dados: ggplot2
qplot(displ, data = mpg, facets=.~drv, col=cyl)
4 f r
2 4 6 2 4 6 2 4 6
0
5
10
15
20
displ
coun
t
Visualizacao de dados: ggplot2
qplot(hwy, data = mpg, facets=drv~.,binwidth=2)
4f
r
10 20 30 40
0
10
20
30
0
10
20
30
0
10
20
30
hwy
coun
t
qplot(Sepal.Length, Petal.Length, data = iris, col=Species,
size = Petal.Width, alpha = I(0.7),
xlab = 'Sepal Length', ylab = 'Petal Length',
main = "Sepal vs. Petal Length in Fisher's Iris data")
2
4
6
5 6 7 8
Sepal Length
Pet
al L
engt
h
Petal.Width
0.5
1.0
1.5
2.0
2.5
Species
setosa
versicolor
virginica
Sepal vs. Petal Length in Fisher's Iris data
qplot(age, circumference, data = Orange,
geom = c('point','line'), size=circumference,
colour = Tree,xlab='Idade',ylab='Circunferencia',
main = 'Circunferencia da laranjeira vs. idade')
●
●
●
● ●
● ●
●
●
●
●●
● ●
●
●
●
●●
● ●
●
●
●
●●
● ●
●
●
●
●
●
● ●
50
100
150
200
400 800 1200 1600
Idade
Circ
unfe
rênc
ia
circumference
●
●
●●
50
100
150
200
Tree●
●
●
●
●
3
1
5
2
4
Circunferência da laranjeira vs. idade
#displ: engine displacement (L)
# hwy: highway miles per gallon
ggplot(mpg, aes(displ, hwy, colour = class)) +
geom_point() +
geom_smooth(se = FALSE, method = 'lm')
●●
●●
●●●
●●
●●
●● ●●●
●
●
●
●
●
● ●
●
●
●●
●
●
●●
●
●
●
●
●
●
● ●
●●
●●
●
●●
● ●
●●
●●
●●
●
●
●
● ●
●
●●
●
●●
●
●●●
●
●●
●
●
● ●●
●
●
●
● ●
●●●●●●
●
●
●●
●
●
●●●●
●
●●●
●
●
●
●●
●
●●
●●
●●
●
●
●●●
●●●
●
●●
●
●
●●
●●
● ●
●
●●
●●
● ●
●
●
●
●●
●●●
● ●
●●
●
●
● ●●●
●●●
●
●●
●
●● ●●●
●
●
●
●●●
●
●
●
●
●
●●
●●
●●
●
●●
●●●
●
●
●
●
●
●
●
●●
●
●
●●
●
●
●
●●
●
●
●
●
●● ●●
●●
●
●
●
●
●●●●
●●
●● ●
20
30
40
2 3 4 5 6 7
displ
hwy
class●
●
●
●
●
●
●
2seater
compact
midsize
minivan
pickup
subcompact
suv
Visualizacao de dados: googleVis
# Install googleVis and run its bundled demo.
# Fix: the package name must be a quoted string; a bare symbol is evaluated
# as a variable, which does not exist.
install.packages("googleVis")
# Point demo() at the package explicitly so it works without attaching it.
demo(googleVis, package = "googleVis")
Visualizacao de dados: mapas com rworldmap
plot(getMap())
points(airports$lon, airports$lat,col='red',pch=17,cex=.1)
lim = airports[airports$IATA_FAA=="LIM",c('lon','lat')]
udi = airports[airports$IATA_FAA=="UDI",c('lon','lat')]
lines(c(lim$lon, udi$lon),c(lim$lat, udi$lat), col='blue')
Manipulacao de dados: dplyr
## require(dplyr)
# Average mpg and weight for each (cyl, am) combination, keeping only the
# groups whose average mpg is above 20.
data(mtcars)
# as_tibble() replaces tbl_df(), which dplyr has deprecated.
mtcars <- as_tibble(mtcars)
mtcars %>%
  group_by(cyl, am) %>%
  select(mpg, cyl, wt, am) %>%
  summarise(avgmpg = mean(mpg), avgwt = mean(wt)) %>%
  filter(avgmpg > 20)
## Source: local data frame [3 x 4]
## Groups: cyl [2]
##
## cyl am avgmpg avgwt
## <dbl> <dbl> <dbl> <dbl>
## 1 4 0 22.90000 2.93500
## 2 4 1 28.07500 2.04225
## 3 6 1 20.56667 2.75500
Manipulacao de dados: dplyr
require(nycflights13)#voos partindo de NY em 2013
data(flights)
glimpse(flights)
## Observations: 336,776
## Variables: 19
## $ year <int> 2013, 2013, 2013, 2013, 201...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ dep_time <int> 517, 533, 542, 544, 554, 55...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 55...
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3...
## $ arr_time <int> 830, 850, 923, 1004, 812, 7...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 7...
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 1...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL...
## $ flight <int> 1545, 1714, 1141, 725, 461,...
## $ tailnum <chr> "N14228", "N24211", "N619AA...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK",...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN",...
## $ air_time <dbl> 227, 227, 160, 183, 116, 15...
## $ distance <dbl> 1400, 1416, 1089, 1576, 762...
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, ...
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-...
Manipulacao de dados: dplyr
head(flights)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## # ... with 13 more variables: arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
Manipulacao de dados: dplyr
I Verbos: operadores de dadosI filter() e slice()I arrange()I select() e rename()I distinct()I mutate() e transmute()I summarise()I sample_n() e sample_frac()
Manipulacao de dados: dplyrI filter(): seleciona linhas
# flights[flights$month == 1 & flights$day == 1, ]
filter(flights, month == 1, day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## 7 2013 1 1 555 600 -5
## 8 2013 1 1 557 600 -3
## 9 2013 1 1 557 600 -3
## 10 2013 1 1 558 600 -2
## # ... with 832 more rows, and 13 more variables:
## # arr_time <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Manipulacao de dados: dplyrI filter(): seleciona linhas de acordo com valores
# voos de janeiro OU | de fevereiro
filter(flights, month == 1 | month == 2)
## # A tibble: 51,955 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## 7 2013 1 1 555 600 -5
## 8 2013 1 1 557 600 -3
## 9 2013 1 1 557 600 -3
## 10 2013 1 1 558 600 -2
## # ... with 51,945 more rows, and 13 more variables:
## # arr_time <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Manipulacao de dados: dplyr
# selecionar linhas que origem contem 'A'
glimpse(filter(flights, grepl('A', origin)))
## Observations: 104,662
## Variables: 19
## $ year <int> 2013, 2013, 2013, 2013, 201...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ dep_time <int> 533, 554, 557, 558, 559, 60...
## $ sched_dep_time <int> 529, 600, 600, 600, 600, 60...
## $ dep_delay <dbl> 4, -6, -3, -2, -1, 0, 0, -8...
## $ arr_time <int> 850, 812, 709, 753, 941, 85...
## $ sched_arr_time <int> 830, 837, 723, 745, 910, 85...
## $ arr_delay <dbl> 20, -25, -14, 8, 31, -7, 12...
## $ carrier <chr> "UA", "DL", "EV", "AA", "AA...
## $ flight <int> 1714, 461, 5708, 301, 707, ...
## $ tailnum <chr> "N24211", "N668DN", "N829AS...
## $ origin <chr> "LGA", "LGA", "LGA", "LGA",...
## $ dest <chr> "IAH", "ATL", "IAD", "ORD",...
## $ air_time <dbl> 227, 116, 53, 138, 257, 152...
## $ distance <dbl> 1416, 762, 229, 733, 1389, ...
## $ hour <dbl> 5, 6, 6, 6, 6, 6, 6, 6, 6, ...
## $ minute <dbl> 29, 0, 0, 0, 0, 0, 0, 10, 5...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-...
Manipulacao de dados: dplyr
# selecionar linhas que origem contem 'A'
flights %>% select(carrier:dest) %>%
filter(grepl('A', origin)) %>%
glimpse()
## Observations: 104,662
## Variables: 5
## $ carrier <chr> "UA", "DL", "EV", "AA", "AA", "B6"...
## $ flight <int> 1714, 461, 5708, 301, 707, 371, 46...
## $ tailnum <chr> "N24211", "N668DN", "N829AS", "N3A...
## $ origin <chr> "LGA", "LGA", "LGA", "LGA", "LGA",...
## $ dest <chr> "IAH", "ATL", "IAD", "ORD", "DFW",...
Manipulacao de dados: dplyrI slice(): seleciona linhas por posicao
# flights[1:7, ]
slice(flights, 1:7)
## # A tibble: 7 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## 7 2013 1 1 555 600 -5
## # ... with 13 more variables: arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
Manipulacao de dados: dplyrI arrange(): reordena linhas por campo
arrange(flights, year, month, day)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## 7 2013 1 1 555 600 -5
## 8 2013 1 1 557 600 -3
## 9 2013 1 1 557 600 -3
## 10 2013 1 1 558 600 -2
## # ... with 336,766 more rows, and 13 more variables:
## # arr_time <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Manipulacao de dados: dplyrI arrange(): reordena linhas por campo
# ordena decrescente de acordo com atraso de chegada
#flights[order(flights$arr_delay, decreasing = TRUE), ]
arrange(flights, desc(arr_delay))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 9 641 900 1301
## 2 2013 6 15 1432 1935 1137
## 3 2013 1 10 1121 1635 1126
## 4 2013 9 20 1139 1845 1014
## 5 2013 7 22 845 1600 1005
## 6 2013 4 10 1100 1900 960
## 7 2013 3 17 2321 810 911
## 8 2013 7 22 2257 759 898
## 9 2013 12 5 756 1700 896
## 10 2013 5 3 1133 2055 878
## # ... with 336,766 more rows, and 13 more variables:
## # arr_time <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Manipulacao de dados: dplyrI select(): seleciona colunas por nomes
select(flights, month, day, arr_delay)
## # A tibble: 336,776 × 3
## month day arr_delay
## <int> <int> <dbl>
## 1 1 1 11
## 2 1 1 20
## 3 1 1 33
## 4 1 1 -18
## 5 1 1 -25
## 6 1 1 12
## 7 1 1 19
## 8 1 1 -14
## 9 1 1 -8
## 10 1 1 8
## # ... with 336,766 more rows
Manipulacao de dados: dplyr
arrange(select(flights, month, day, arr_delay),
desc(arr_delay))
## # A tibble: 336,776 × 3
## month day arr_delay
## <int> <int> <dbl>
## 1 1 9 1272
## 2 6 15 1127
## 3 1 10 1109
## 4 9 20 1007
## 5 7 22 989
## 6 4 10 931
## 7 3 17 915
## 8 7 22 895
## 9 12 5 878
## 10 5 3 875
## # ... with 336,766 more rows
Manipulacao de dados: dplyr
# usar somente as coluna de year ate day (ambas inclusas)
select(flights, year:day)
## # A tibble: 336,776 × 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
Manipulacao de dados: dplyr
# usar todas as colunas EXCETO de year ate day
select(flights, -(year:day))
## # A tibble: 336,776 × 16
## dep_time sched_dep_time dep_delay arr_time
## <int> <int> <dbl> <int>
## 1 517 515 2 830
## 2 533 529 4 850
## 3 542 540 2 923
## 4 544 545 -1 1004
## 5 554 600 -6 812
## 6 554 558 -4 740
## 7 555 600 -5 913
## 8 557 600 -3 709
## 9 557 600 -3 838
## 10 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables:
## # sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
Manipulacao de dados: dplyr
# renomear coluna, mas fica somente com ela
select(flights, chassiNum = tailnum)
## # A tibble: 336,776 × 1
## chassiNum
## <chr>
## 1 N14228
## 2 N24211
## 3 N619AA
## 4 N804JB
## 5 N668DN
## 6 N39463
## 7 N516JB
## 8 N829AS
## 9 N593JB
## 10 N3ALAA
## # ... with 336,766 more rows
Manipulacao de dados: dplyr
# renomear coluna E manter outras
rename(flights, chassiNum = tailnum)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 2
## 2 2013 1 1 533 529 4
## 3 2013 1 1 542 540 2
## 4 2013 1 1 544 545 -1
## 5 2013 1 1 554 600 -6
## 6 2013 1 1 554 558 -4
## 7 2013 1 1 555 600 -5
## 8 2013 1 1 557 600 -3
## 9 2013 1 1 557 600 -3
## 10 2013 1 1 558 600 -2
## # ... with 336,766 more rows, and 13 more variables:
## # arr_time <int>, sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>,
## # chassiNum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
Manipulacao de dados: dplyr
distinct(flights, tailnum)
## # A tibble: 4,044 × 1
## tailnum
## <chr>
## 1 N14228
## 2 N24211
## 3 N619AA
## 4 N804JB
## 5 N668DN
## 6 N39463
## 7 N516JB
## 8 N829AS
## 9 N593JB
## 10 N3ALAA
## # ... with 4,034 more rows
Manipulacao de dados: dplyr
distinct(flights, origin, dest)
## # A tibble: 224 × 2
## origin dest
## <chr> <chr>
## 1 EWR IAH
## 2 LGA IAH
## 3 JFK MIA
## 4 JFK BQN
## 5 LGA ATL
## 6 EWR ORD
## 7 EWR FLL
## 8 LGA IAD
## 9 JFK MCO
## 10 LGA ORD
## # ... with 214 more rows
Manipulacao de dados: dplyrI mutate(): adiciona coluna (recodificacao de variaveis)
mutate(flights, # gain e speed s~ao recodificac~oes
gain = arr_delay - dep_delay,
speed = distance / air_time * 60) %>%
select(tailnum, gain, speed)
## # A tibble: 336,776 × 3
## tailnum gain speed
## <chr> <dbl> <dbl>
## 1 N14228 9 370.0441
## 2 N24211 16 374.2731
## 3 N619AA 31 408.3750
## 4 N804JB -17 516.7213
## 5 N668DN -19 394.1379
## 6 N39463 16 287.6000
## 7 N516JB 24 404.4304
## 8 N829AS -11 259.2453
## 9 N593JB -5 404.5714
## 10 N3ALAA 10 318.6957
## # ... with 336,766 more rows
Manipulacao de dados: dplyr
# resumir tabela de acordo com origem do voo
flights %>%
group_by(origin) %>%
summarize(n=n())
## # A tibble: 3 × 2
## origin n
## <chr> <int>
## 1 EWR 120835
## 2 JFK 111279
## 3 LGA 104662
Manipulacao de dados: dplyr
# resumir tabela de acordo com origem e destino do voo
flights %>%
group_by(origin,dest) %>%
summarize(n=n()) %>%
arrange(desc(n))
## Source: local data frame [224 x 3]
## Groups: origin [3]
##
## origin dest n
## <chr> <chr> <int>
## 1 JFK LAX 11262
## 2 LGA ATL 10263
## 3 LGA ORD 8857
## 4 JFK SFO 8204
## 5 LGA CLT 6168
## 6 EWR ORD 6100
## 7 JFK BOS 5898
## 8 LGA MIA 5781
## 9 JFK MCO 5464
## 10 EWR BOS 5327
## # ... with 214 more rows
Manipulacao de dados: dplyr
# Organizar dados para obter atraso medio
# por distancia viajada
by_tailnum <- group_by(flights, tailnum)
delay <- summarise(by_tailnum, count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE))
delay <- filter(delay, count > 20, dist < 2000)
Manipulacao de dados: dplyr
ggplot(delay, aes(dist, delay)) +
geom_point(aes(size = count), alpha = 1/2) +
geom_smooth() + scale_size_area()
−20
0
20
40
60
500 1000 1500 2000
dist
dela
y
count
500
1000
1500
2000
2500
Manipulacao de dados: dplyr
I Conexao a bancos de dados usando dplyr
I Mais informacoes em [link]
my_db <- src_sqlite('my_db.sqlite3')
flights_tbl <- tbl(my_db, 'hflights') # nome da table
## possivel usar qualquer comando dplyr em flights_tbl
## possivel usar SQL
tbl(my_db, sql('SELECT * FROM hflights LIMIT 100'))
flights %>% group_by(origin, dest) %>% explain()
Manipulacao de dados: data.tableI Package data.table e util para grandes datasets
# library() stops with an error if a package is missing; require() would
# only return FALSE and let the script fail later with a confusing message.
library(data.table)
library(hflights)
DT <- as.data.table(hflights) # flights from Houston, 2011
# Mean air time per carrier for October flights; na.rm = TRUE skips missing
# values directly instead of wrapping the column in na.omit().
DT[Month == 10, mean(AirTime, na.rm = TRUE), by = UniqueCarrier]
## UniqueCarrier V1
## 1: AA 68.76471
## 2: AS 255.29032
## 3: B6 176.93548
## 4: CO 141.52861
## 5: DL 92.76824
## 6: WN 87.14947
## 7: XE 82.44422
## 8: OO 114.98865
## 9: UA 166.18354
## 10: US 137.46078
## 11: EV 113.12273
## 12: F9 126.55357
## 13: FL 90.85561
## 14: MQ 100.13054
Imputacao de dados faltantes
I PacotesI Missing Data Imputation and Model Checking: miI Imputation: imputationI Multivariate Imputation by Chained Equations: miceI ... varios outros
I Dados podem faltar ...I ... aleatoriamenteI ... por alguma razão não óbvia
Por que imputação? Problemas com dados faltantes
require(Hmisc)
x1 <- c(1,2,3,NA,5)
describe(x1)
## x1
## n missing distinct Info Mean Gmd
## 4 1 4 1 2.75 2.167
##
## Value 1 2 3 5
## Frequency 1 1 1 1
## Proportion 0.25 0.25 0.25 0.25
mean(x1) # PROBLEMA!
## [1] NA
Evitando dados faltantes
# Three manual ways of coping with a missing value in x1.
x1 <- c(1, 2, 3, NA, 5)
mean(x1, na.rm = TRUE) # TRUE spelled out: T is an ordinary, reassignable name
## [1] 2.75
# Fix: keep the NON-missing values (the original mask was inverted and kept
# only the NA), but... this reduces the number of observations.
x2 <- x1[!is.na(x1)]
x3 <- x1
# Fix: test x3 itself. Indexing with is.na(x2) used a vector of a different
# length, and recycling overwrote every element with 0.
x3[is.na(x3)] <- 0 # but... the replacement could also be the mean
x4 <- ifelse(is.na(x1), 0, x1) # same replacement as a single expression
## Or USE IMPUTATION packages
Imputacao de dados faltantes: mice
require(mice)
ar <- airquality
ar[4:10,3] <- rep(NA,7)
ar[1:5,4] <- NA
md.pattern(ar)#visualizar NA's nos dados
## Month Day Temp Solar.R Wind Ozone
## 104 1 1 1 1 1 1 0
## 34 1 1 1 1 1 0 1
## 4 1 1 1 0 1 1 1
## 3 1 1 1 1 0 1 1
## 3 1 1 0 1 1 1 1
## 1 1 1 1 0 1 0 2
## 1 1 1 1 1 0 0 2
## 1 1 1 1 0 0 1 2
## 1 1 1 0 1 0 1 2
## 1 1 1 0 0 0 0 4
## 0 0 5 7 7 37 56
Imputacao de dados faltantes: transformacao de variaveis
I Predictive mean matchingI statisticalhorizons.com/predictive-mean-matching
## require(mice)
# method: "mean" = mean imputation, "pmm" = predictive mean matching
ar_imp <- mice(ar, m = 5, maxit = 50,
               method = "pmm", seed = 500,
               printFlag = FALSE) # FALSE spelled out: F can be reassigned
# Namespaced so the call cannot be masked by another attached package that
# also exports complete() (e.g. tidyr).
ar <- mice::complete(ar_imp)
md.pattern(ar)
## Ozone Solar.R Wind Temp Month Day
## [1,] 1 1 1 1 1 1 0
## [2,] 0 0 0 0 0 0 0
Imputacao de dados faltantes: dados multivariadosI Observacoes mais proximas: k-vizinhos mais proximos
## require(VIM) # para usar kNN()
# Knock out some values of airquality, then impute with k-nearest neighbours.
ar <- airquality
ar[4:10, 3] <- NA # column 3 (Wind), rows 4-10
ar[1:5, 4] <- NA  # column 4 (Temp), rows 1-5
# Only the listed columns are imputed, so Temp keeps its NAs.
# Fix: the argument is written in full as `variable`; the original `var=`
# only worked through partial argument matching.
glimpse(kNN(ar, variable = c("Ozone", "Solar.R", "Wind"), k = 5))
## Observations: 153
## Variables: 9
## $ Ozone <int> 41, 36, 12, 18, 23, 28, 23, 19...
## $ Solar.R <int> 190, 118, 149, 313, 194, 194, ...
## $ Wind <dbl> 7.4, 8.0, 12.6, 8.6, 8.6, 9.7,...
## $ Temp <int> NA, NA, NA, NA, NA, 66, 65, 59...
## $ Month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
## $ Day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,...
## $ Ozone_imp <lgl> FALSE, FALSE, FALSE, FALSE, TR...
## $ Solar.R_imp <lgl> FALSE, FALSE, FALSE, FALSE, TR...
## $ Wind_imp <lgl> FALSE, FALSE, FALSE, TRUE, TRU...
Deteccao de outliers
Definicao de outlier (dados espurios)
“Um outlier é uma observação tão destoante de outras que torna-se suspeita de ter sido gerada de uma forma distinta” (Hawkins, 1980)
I Outliers refletem:I erros de aferiçãoI falhas de execuçãoI variabilidade intrínseca
Exemplos de outliers
I Caso Hadlum vs. Hadlum (1948): caso de divórcio por causa de uma gravidez de 50 semanas
I Temperaturas: 21h: 21C, 22h: 22C, 23h: 23C, 24h: 38C, 01h:38C,02h: 21C
I Registro de ganho de peso: 1.2kg, 1.6kg, 1.9kg, 1.55kg, 2.2kg,2.25kg
I 10 dados foram lancados e o numero de ’6’, registrado: 2, 0, 3, 12,2,0 ,1 ,1,3
Tratamento de outliers
I “Outliers, para uns, são indesejáveis; para outros, a fonte da informação mais importante”
I The Study of Outliers: Purpose and Model, Barnett (1978):tratamento
1. Identificacao (Deteccao)2. Incorporacao – mudanca do modelo do fenomeno3. Rejeicao4. Acomodacao
Identificacao de outliers
data(rivers)
boxplot(rivers, horizontal=TRUE, col='slategray')
rug(rivers, col='red',ticksize=.1, lwd=.5)
●●● ● ●● ●●● ●●
0 500 1000 1500 2000 2500 3000 3500
Rejeicao de outliers
# Repeatedly drop the points that boxplot.stats() flags as outliers until a
# pass flags nothing: each removal changes the quartiles, so a new pass can
# flag further points.
rivers.limpo <- rivers
outliers <- NA # placeholder of length 1 so the loop body runs at least once
while (length(outliers) != 0) {
  outliers <- boxplot.stats(rivers.limpo)$out
  manter <- !(rivers.limpo %in% outliers) # TRUE for the values to keep
  rivers.limpo <- rivers.limpo[manter]
}
# Fix: the closing brace and boxplot() shared one line, which does not parse.
boxplot(rivers.limpo, horizontal = TRUE, col = "slategray")
# Fix: draw the rug from the cleaned data so it matches the boxplot above;
# the original used the uncleaned `rivers`.
rug(rivers.limpo, col = "red", ticksize = .1, lwd = .5)
200 400 600 800 1000
Identificacao de outliers: pacote outliersI Funcoes:
I Busca por valor com maior diferenca a media: outlier()I Testa e remove outlier encontrado: rm.outlier()
require(outliers)
set.seed(1234); y = rnorm(8); y
## [1] -1.2070657 0.2774292 1.0844412 -2.3456977
## [5] 0.4291247 0.5060559 -0.5747400 -0.5466319
outlier(y)
## [1] -2.345698
rm.outlier(y)
## [1] -1.2070657 0.2774292 1.0844412 0.4291247
## [5] 0.5060559 -0.5747400 -0.5466319
Acomodacao de outliers: estatısticas robustasI Outliers sao dados com erros ou extremosI Outliers afetam medidas-resumo
area <- state.area
summary(area)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1214 37320 56220 72370 83230 589800
mean(area)
## [1] 72367.98
median(area)
## [1] 56222
mean(area, trim=0.05) #desconsidera 10% de outliers
## [1] 59957.22
Acomodação de outliers: pacote robust – Estatísticas robustas
## require(robust)
sd(area) #desvio padr~ao
## [1] 88278.01
mad(area) #desvio absoluto mediano
## [1] 35711.39
IQR(area) #intervalo interquartile
## [1] 45916.75
fivenum(area) #boxplot
## [1] 1214 36291 56222 83557 589757
Outliers: transformacao de variaveis
data(islands)
# problema: tudo e outlier?
boxplot(islands, horizontal=TRUE)
●● ●● ●● ●●
0 5000 10000 15000
Outliers: transformacao de variaveis
islands.z <- scale(islands)
attr(islands.z, 'scaled:center') # media original
## [1] 1252.729
attr(islands.z, 'scaled:scale') # desv. pad. original
## [1] 3371.146
summary(islands.z)
## V1
## Min. :-0.3680
## 1st Qu.:-0.3655
## Median :-0.3594
## Mean : 0.0000
## 3rd Qu.:-0.3172
## Max. : 4.6676
Outliers: transformacao de variaveis
islands.ln <- log(islands)
summary(islands.ln)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.485 3.020 3.713 4.446 5.211 9.740
boxplot(islands.ln, horizontal=TRUE)
rug(islands.ln, col='red',ticksize=.1, lwd=.5)
●● ●●●
4 6 8 10
Outliers: transformacao de variaveis
# Logical mask of the entries of `islands` whose value exceeds 1000.
# The comparison already yields TRUE/FALSE (and keeps the names), so
# wrapping it in ifelse(cond, TRUE, FALSE) is unnecessary.
continentes <- islands > 1000
# Subsetting by the mask keeps the entries in their original order
islands[continentes]
## Africa Antarctica Asia
## 11506 5500 16988
## Australia Europe North America
## 2968 3745 9390
## South America
## 6795
Outliers: winsorizing
I Método de substituição de valores extremos por percentis altos (0.2 é o padrão)
require(psych)
data <- c(sample(x=1:10, size=20, replace=TRUE), 13,17)
data
## [1] 3 3 2 3 4 4 2 1 3 9 6 10 9 1 5 3 4
## [18] 6 2 8 13 17
winsor(data,trim=0.1)
## [1] 3.0 3.0 2.0 3.0 4.0 4.0 2.0 2.0 3.0 9.0 6.0 9.9
## [13] 9.0 2.0 5.0 3.0 4.0 6.0 2.0 8.0 9.9 9.9
Selecao de caracterısticas
I Principais abordagens:I Eliminar características redundantesI Eliminar características com variância zeroI Buscar pelo subconjunto mínimo de características para obter o
melhor resultado de classificaçãoI Buscar pelo conjunto máximo que contenha algum tipo de informação relevante
Selecao de caracterısticas: correlacoes
featurePlot(x = iris[, 1:4], y = iris$Species, #caret
plot = "pairs", auto.key = list(columns = 3))
Scatter Plot Matrix
Sepal.Length78
7 8
56
5 6 ●●●●●
●
●●
●●
●
●●●
● ●●
●
●
●●
●●
●●● ●●●
●●
● ●●
●●●
●●
●●● ●
● ●●
●●
●●
●
●●
●
●
●
●
●
●
●●
●● ●●
●
●●●
●●●● ●
●●●●
●●●●●●
●
●
●●
●●●
●●
●
● ●●●
●
●
●●
●
●●
●
●
●
●●
●●●
● ●
●●
●●
●
●
●
●
●●
●
●●●
●●●
●●●
●
●●●
●●●
●
●●●● ●
●●
●●●●●●
●●
●●●
●●●
●●●●
●
●●●
●●●●●
●●●●
●●●
●●●
●●
●●●●
●●●●●
●●
●
●●
●
●
●
●
●
●
●●
●●●●
●
●●●
●●● ●●
●●●●
●●●●● ●
●
●
●●
●●●
●●
●
●●●●
●
●
●●
●
●●
●
●
●
●●
●●●
●●
●●
●●
●
●
●
●
●●●
●●●
●●●
●●●
●
●●●
●●●
●
●●●●●●●
●●●●●
●
●●
●●●
●●●
●●●
●
●
●●
●●
●●●●●●●●
●●●
●●●
●●
●●●●
●●●
●●
●●
●
●●
●
●
●
●
●
●
●●
●● ●●
●
●●●
●●●●●
●●● ●
●●●●
● ●
●
●
●●
●●●
●●
●
●●●●
●
●
●●
●
● ●
●
●
●
●●
●●●
● ●
●●
●●
●
●
●
●
●●
●
●●●
● ●●
●●●
●
●●●
● ●●
●
●●●●● ●
●
●
●●●
●●
●●
●●
●●
●●
●
●
●
●●●
●●●●●
●
●●●●●
●
●●
●●●●
●
●●
●
●●●
●
●
●
●
● ●● ●
●
●●
●
●
●●
●
●
●
●●●●
●
●●
●
●●
●●●●●●
●●●
●●●
●●
●
●
●●
●
●●
●●● ●
●●
●
●●●● ●
●
●
●
●
●
●●
●●
●●
●
●
●
●
● ●●
● ●
●●
●●●
●
●●●
●
●●● ●●●
●
●●●
●
●
●
● Sepal.Width3.54.04.5
3.5 4.5
2.02.53.0
2.0 3.0
●
●●●
●●
●●
●●
●●
●●
●
●
●
●●●
●●●●●
●
●●●●●●
●●
●●●●
●
●●
●
●●●
●
●
●
●
● ●●●
●
●●
●
●
●●
●
●
●
●●●●
●
●●
●
●●●●●●●●
●●●● ●
●
●●
●
●
●●
●
●●
●●●●
●●
●
●●●● ●
●
●
●
●
●
●●
●●
●●
●
●
●
●
● ●●
●●
●●
●●●
●
●●●
●
●●●●●●
●
●●●
●
●
●
●
●
●●●
●●
●●
●●
●●
●●
●
●
●
●●●
●●●●●
●
●●●●●
●
●●
●●●●
●
●●
●
●●
●
●
●
●
●
● ●●●
●
●●
●
●
●●
●
●
●
●●●●
●
●●
●
●●
●●●●
●●●●●● ●
●
●●
●
●
●●
●
●●
●●●●
●●
●
●●● ●●
●
●
●
●
●
●●
●●
●●
●
●
●
●
●●●
●●
●●
●●
●
●
●●●
●
●●● ● ●●
●
●●●
●
●
●
●
●●●●●●●●● ● ●●●● ●●●● ●●●●●
●●●●●●●● ●●●●● ●●● ●●●● ●●●●● ●●
●●●
●●● ●
●
●●
●●●●
●●●
●●
●●●●●●●
●●●
●●●●
●● ● ●●●●● ●
●●
●●● ●
●
●
●●
●●●●
●
●● ●
●●●
●● ●●
●●
●●
●
●
●● ●
●●● ●● ●●●
●●
●●●
●●●●●●●●●●●
●●●● ● ●●●●● ●●●● ● ●●● ●●● ●●
●●● ●●●●● ● ●●●● ●●● ●●● ● ● ●● ●● ●●
●●●
●●● ●
●
●●
●●●
●
●●●●
●●
●●
● ●●●
●●●
●●● ●
●● ●●● ●●
● ●●
●● ●●●
●
●
●●
●●●●
●
●● ●
●● ●● ● ●●
●●
●●
●
●
●●●
●●●●● ●●●
●●
●●●●●●●●●
●● ● ●●
Petal.Length4567
4 5 6 7
1234
1 2 3 4●●●●●
●●●●●●●●●●●●●
●●●●●
●●●●●●●●●●●●●●●●●●●●●●
●●●●●
●●●
●●● ●
●
●●
●●●
●
●●●●●
●●
●●●
●●● ●●
●●●●
●●●●●●●
●●●
●●●●●
●
●
●●
●● ●●
●
●● ●
●●●
● ●●●
●●
●●
●
●
●●●
●●●● ●●●
●●
●●●
●● ●
●●●●●●●●●
●●●●●●●●● ● ●●●● ●
●●● ●●●●
●●
●●●●●●●●
●●●● ●●● ●●●●
●●●●● ●●
●● ●●
●●
●
●●●
●
●
●
●● ●●
●
●
●
●
●●
●●●●●
●
●●●●
●● ● ●●●●●
●●
●●●● ●
●●
●
●●
●
● ●
● ●●
●
●●●●
● ●
●
●●
●
●● ●
●●
●●●●
●● ●
●
●●
●●
●●●
●●
●
●●●
●●●
●
●●●● ●●●●●● ●●●● ●
●●● ●●●●
●●●●●●●●●
●●●●● ●●● ●●● ●
●●● ●● ●●
●●●●●●
●
●●●
●
●
●
●●●●
●
●
●
●
●●
●●●●●
●
●●●●
● ● ●●● ●●●
●●
●● ●●●●●
●
●●
●
●●
● ●●
●
●●●●
● ●
●
●●
●
●●●
●●
●●●●
●● ●●
●●
● ●
●●●●●
●
●●
●
● ●●
●
●●●●●●●●●●●●●●●●●●●●●●
●●●●
●●●●●●●●●●●●●●●●●
●●●●●●●
●●●●
●●●
●●●
●
●
●
●● ●●
●
●
●
●
●●●●●●●
●
●●●●
●●●●●●●●●
●●
●●●●●●
●
●●
●
● ●
● ●●
●
●●●●
●●
●
●●
●
●● ●●
●●●●
●
●●●
●
●●
●●
●●●●●
●
●●
●
●●●
●
Petal.Width1.52.02.5
1.5 2.5
0.00.51.0
0.0 1.0
setosa versicolor virginica● ● ●
Selecao de caracterısticas: correlacoes
data(iris)
cor(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length
## Sepal.Length 1.0000000 -0.1175698 0.8717538
## Sepal.Width -0.1175698 1.0000000 -0.4284401
## Petal.Length 0.8717538 -0.4284401 1.0000000
## Petal.Width 0.8179411 -0.3661259 0.9628654
## Petal.Width
## Sepal.Length 0.8179411
## Sepal.Width -0.3661259
## Petal.Length 0.9628654
## Petal.Width 1.0000000
Selecao de caracterısticas: correlacoes
corr <- cor(iris[,1:4])
altaCor <- sum(abs(corr[upper.tri(corr)]) > .9)
altaCor
## [1] 1
Selecao de caracterısticas: correlacoes
# library() fails immediately if caret is not installed; require() would
# only return FALSE.
library(caret)
corr <- cor(iris[, 1:4])
# Column indices, within the four numeric features, that findCorrelation()
# suggests removing at the 0.95 cutoff.
redundantes <- findCorrelation(corr, cutoff = 0.95)
# Fix: when nothing exceeds the cutoff the result is empty, and
# iris[, -integer(0)] would select zero columns instead of all of them.
irisSemAltaRedundancia <-
  if (length(redundantes) > 0) iris[, -redundantes] else iris
Selecao de caracterısticas: correlacoes
library(earth)
data(etitanic)
head(model.matrix(survived ~ ., data = etitanic))
## (Intercept) pclass2nd pclass3rd sexmale age
## 1 1 0 0 0 29.0000
## 2 1 0 0 1 0.9167
## 3 1 0 0 0 2.0000
## 4 1 0 0 1 30.0000
## 5 1 0 0 0 25.0000
## 6 1 0 0 1 48.0000
## sibsp parch
## 1 0 0
## 2 1 2
## 3 1 2
## 4 1 2
## 5 1 2
## 6 0 0
dummies <- dummyVars(survived ~ ., data = etitanic)
head(predict(dummies, newdata = etitanic))
## pclass.1st pclass.2nd pclass.3rd sex.female sex.male
## 1 1 0 0 1 0
## 2 1 0 0 0 1
## 3 1 0 0 1 0
## 4 1 0 0 0 1
## 5 1 0 0 1 0
## 6 1 0 0 0 1
## age sibsp parch
## 1 29.0000 0 0
## 2 0.9167 1 2
## 3 2.0000 1 2
## 4 30.0000 1 2
## 5 25.0000 1 2
## 6 48.0000 0 0
#https://www.youtube.com/watch?v=igPQ-pI8Bjo&list=WL&index=33 @ 4:12
Seleção de características: combinações
dados <- matrix(0, nrow=6, ncol=5)
dados[,1] <- c(1, 1, 1, 1, 1, 1)
dados[,2] <- c(1, 1, 1, 0, 0, 0)
dados[,3] <- c(0, 0, 0, 1, 1, 1)
dados[,4] <- c(1, 0, 0, 1, 0, 0)
dados[,5] <- c(0, 0, 1, 0, 0, 1)
comb <- findLinearCombos(dados)
dados[,-comb$remove]
## [,1] [,2] [,3] [,4]
## [1,] 1 1 1 0
## [2,] 1 1 0 0
## [3,] 1 1 0 1
## [4,] 1 0 1 0
## [5,] 1 0 0 0
## [6,] 1 0 0 1
Redução de dimensionalidade: pacote FactoMineR
require(FactoMineR);
data("decathlon")
res.pca <- PCA(decathlon, # PCA = Principal Component Analysis
quanti.sup = 11:12, # rank:pontos - QUANTItativas
quali.sup = 13)# categoricas
Redução de dimensionalidade: PCA
plot(res.pca, choix="ind")
●
−10 −5 0 5 10
−4
−2
02
4
Individuals factor map (PCA)
Dim 1 (32.72%)
Dim
2 (
17.3
7%)
●●●
●
●
●
●
●● ●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●
●
●
SEBRLECLAYKARPOV
BERNARD
YURKOV
WARNERSZSIVOCZKY
McMULLENMARTINEAUHERNU
BARRAS
NOOL
BOURGUIGNON
SebrleClay
Karpov
Macey
Warners
Zsivoczky
Hernu
Nool
Bernard
Schwarzl
Pogorelov
SchoenbeckBarras
Smith
AveryanovOjaniemiSmirnovQi
Drews
Parkhomenko
Terek
Gomez
Turi
Lorenzo
Karlivans
Korkizoglou
Uldal
Casarsa
DecastarOlympicG
Redução de dimensionalidade: PCA
# habillage: as cores seguem a 13ª variável
plot(res.pca, choix="ind",habillage=13)
●
−10 −5 0 5 10
−4
−2
02
4
Individuals factor map (PCA)
Dim 1 (32.72%)
Dim
2 (
17.3
7%)
●●●
●
●
●
●
●● ●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●
●
●
SEBRLECLAYKARPOV
BERNARD
YURKOV
WARNERSZSIVOCZKY
McMULLENMARTINEAUHERNUBARRAS
NOOL
BOURGUIGNON
SebrleClay
Karpov
Macey
Warners
Zsivoczky
HernuNool
Bernard
Schwarzl
Pogorelov
SchoenbeckBarras
Smith
AveryanovOjaniemiSmirnov
Qi
Drews
Parkhomenko
Terek
Gomez
Turi
Lorenzo
Karlivans
Korkizoglou
Uldal
Casarsa
DecastarOlympicG
DecastarOlympicG
Redução de dimensionalidade: PCA
plot(res.pca, choix="var")
●
−2 −1 0 1 2
−1.
0−
0.5
0.0
0.5
1.0
Variables factor map (PCA)
Dim 1 (32.72%)
Dim
2 (
17.3
7%)
100m
Long.jump
Shot.put
High.jump
400m
110m.hurdleDiscus
Pole.vault
Javeline1500m
Rank Points
Redução de dimensionalidade: PCA
barplot(res.pca$eig[,1], main = "Eigenvalues",
names.arg = paste("Dim", 1:nrow(res.pca$eig), sep = ""))
Dim1 Dim2 Dim3 Dim4 Dim5 Dim6 Dim7 Dim8 Dim9
Eigenvalues
0.0
1.0
2.0
3.0
Redução de dimensionalidade: PCA
# usar as dimensões (componentes principais) 3 e 4
plot(res.pca, choix = "var", axes = c(3, 4),
lim.cos2.var = 0) # mostrar var so com qualidade > 0
●
−2 −1 0 1 2
−1.
0−
0.5
0.0
0.5
1.0
Variables factor map (PCA)
Dim 3 (14.05%)
Dim
4 (
10.5
7%)
100mLong.jumpShot.put
High.jump400m
110m.hurdle
Discus
Pole.vaultJaveline
1500mRank
Points
Redução de dimensionalidade: pacote dimRed
iris.pca <- embed(loadDataSet("Iris"), "PCA")
quality(iris.pca); plot(iris.pca)
## 25
## 0.6526642
PC1
−1.0 −0.5 0.0 0.5 1.0
−3
02
4
● ●● ●●●
●● ●●● ● ●●
●● ● ●●
● ●●●
● ● ●●●● ●●●●● ●●● ● ●●● ●●●●
●● ●● ●
●●
●
●
●●
●
●
●
●●
● ●●
●
● ●●
●
●
●●
●●●●
●●●
● ●●●
●●●
●●
● ●●●
●
●
●●●●
●
●
●
●
●●
●
●
●
●●
●
● ●●
●●●●
● ●
●
●
●
●
●
●●
●●●●
●●
●● ●
●
● ●●
● ●●
●
●●●
●●●●
−3 −2 −1 0 1 2 3 4
−1.
00.
5
●
●●●
●
●
●●
●
●
●
●●
●
●●
●
●
●
● ●●● ●
●●●
●●
●●
●
●●
●●
●
●
●
●●
●
●
●●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●
●
●●
●
●●●●
●●●
●
●● ●●
●●
●●
● ●●
●●
●●
● ●
●
●
●
●●●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●●
●●
●
●
●
●
●
●
●
●●
●● ●
● ●
●
●●
●
●
●●●
●●
●
●
●●●
●
●●
●
PC2
Redução de dimensionalidade: pacote dimRed
print(dimRedMethodList())
## [1] "DiffusionMaps" "DRR"
## [3] "FastICA" "KamadaKawai"
## [5] "DrL" "FruchtermanReingold"
## [7] "HLLE" "Isomap"
## [9] "kPCA" "LaplacianEigenmaps"
## [11] "LLE" "MDS"
## [13] "nMDS" "PCA"
## [15] "tSNE"
print(dimRedQualityList())
## [1] "Q_local" "Q_global"
## [3] "mean_R_NX" "AUC_lnK_R_NX"
## [5] "total_correlation" "cophenetic_correlation"
## [7] "distance_correlation" "reconstruction_rmse"
Redução de dimensionalidade: pacote dimRed
iris.ica <- embed(loadDataSet("Iris"), "FastICA")
quality(iris.ica, "distance_correlation"); plot(iris.ica)
## [1] 0.9149342
ICA1
−1.5 −0.5 0.0 0.5 1.0 1.5
−2
02
●
●●●
●
●
●●
●
●
●
●●
●
●●
●
●
●
●● ●●●
●●●
●●
●●
●
●●
●●
●
●
●
●●
●
●
●●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●
●
●●
●
● ●● ●
● ●●
●●
●●●
●●
●●
●● ●
●●
● ●
●●
●
●
●
●●●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●●
●●
●
●
●
●
●
●
●
●●
●●●
●●
●
● ●
●
●
●●●
●●
●
●
●● ●
●
●●
●
−2 −1 0 1 2 3
−1.
50.
01.
5
●●●● ● ●● ●● ● ●●●● ● ●●● ●●●●●
●●● ● ●●●● ●● ●
● ● ●●● ●●●●
● ●● ●● ●●
●●●
●●● ●
●
●●●
●●●
●●●
●●
●●●
●●
● ●●●
●
●●● ●
●● ● ●●
●●● ●●
●● ●● ●
●
●
●
●●
● ●●
●
●● ●
●● ●
● ● ●●
●●
●●
●
●
●● ●
●●
● ●● ●
●●
●●
●●●
●●●●
●●●● ●●●
ICA2
Exercícios: mapa
- Fazer gráfico de rotas entre aeroportos usando flights
- Posições geográficas: airports.dat [link]. Segue exemplo:
airports <- tbl_df(read.csv('airports.dat',header=FALSE))
colnames(airports) <- c('ID', 'name', 'city', 'country',
'IATA_FAA','ICAO','lat','lon',
'alt','timez','T','DST', 'arpt','O')
select(airports,lat,lon,IATA_FAA,country) %>%
filter(IATA_FAA %in% c("LGA","MIA"))
## # A tibble: 2 × 4
## lat lon IATA_FAA country
## <dbl> <dbl> <fctr> <fctr>
## 1 25.7932 -80.2906 MIA United States
## 2 40.7772 -73.8726 LGA United States
#filter(airports, country == 'Brazil')
#%>% select(IATA_FAA,lat,lon)
Mapas: rworldmap
aeroportos <- filter(airports, country == 'Brazil') %>%
select(lat,lon)
plot(getMap(), xlim=c(-60,-50), ylim=c(-35, 10))
points(aeroportos$lon, aeroportos$lat,
col='red',pch=20,cex=.4)
lines(aeroportos[1:2,]$lon, aeroportos[1:2,]$lat, col='blue')
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●
●
●
●
●
●
●
●
●
● ●
● ●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●
● ●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
● ●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●
●●
●
●
●
●●
●●
●
●
●
●
●
●●●
●
●●
●
● ●
●
●
●
●
●
●
●
●
●
●
● ●
●
●
●
Trabalho 1
- Dataset: “Cessões de Direitos Minerários”
  - https://app.dnpm.gov.br/DadosAbertos/SCM/Cessoes_de_Direitos.csv
  - Na linha 16934, trocar “D”” por D
- Fazer relatório contendo estatísticas-resumo, testes de hipóteses e gráficos que proveem informações para responder às seguintes perguntas:
  - Quais são os estados que mais e menos concedem direitos minerários?
  - Quais as substâncias mais procuradas nos últimos 2, 5 e 10 anos?
  - Existem empresas que recebem um número de concessões muito superior a outras?
  - Quais são as empresas que têm monopólios/duopólios nacionais de extração de substâncias?
  - Existem regiões de concentração geográfica de concessões?
    - Lat/lon das cidades brasileiras [link]
- Entregar arquivos “.Rnw” e “.pdf”
Exercícios
- Formule perguntas interessantes que podem ser respondidas com mineração de dados do dataset que lhe for atribuído.
- Realize um estudo exploratório sobre variáveis relevantes para responder às perguntas formuladas.