budapest users of r network - 2013. november 27

Szövegbányászat – BURN Meetup, 2013. november 27.

Vázlat

1. Bővebben

2. Odi et amo

3. Előfeldolgozás

4. TermDocumentMatrix

5. Számoljunk szavakat

6. A szavak titkos élete

Kreatív kölcsönzés

• Ingo Feinerer: A text mining framework in R and its applications

• Gaston Sanchez: Mining Twitter with R

• Machine Learning for Hackers

• Natural Language Annotation for Machine Learning

Odi et amo

• Elvileg mindent megcsinálhatunk R-ben

• Elvileg!

• Gyakorlatilag egyszerűbb barkácsolni

Alternatívák

• Python NLTK - átfogó, nagyon jó

• de nem annyira, mint az R

• OpenNLP/GATE/UIMA - robosztusabb, gyorsabb, skálázhatóbb

• de nem csak a főbb eljárásokat implementálták

Magyar

• boilerpipe - netes szövegek beszerzésére

• ocamorph - szótövezésre

• hunpos - POS tagging

• Érdemesebb az előfeldolgozást Python-ban végezni (mindegyikhez van wrapper)

• puristák system()

Előfeldolgozás

library(tm)

oz <- Corpus(DirSource("OzBooks/"))

meta(oz, tag = "Author", type = "local") <-

c(rep("Lyman Frank Baum", 3),

rep("Ruth Plumly Thompson", 2))

Előfeldolgozás

# adding metadata

meta(oz, "Heading", "local") <-

c("The Wonderful Wizard of Oz",

"The Marvelous Land of Oz",

"Ozma of Oz",

"The Royal Book of Oz",

"Ozoplaning with the Wizard of Oz")

TermDocumentMatrix

# Term Doc Matrices

ozMatBaum <- TermDocumentMatrix(oz[1:3])

ozMatRoyal <- TermDocumentMatrix(oz[4])

ozMatThompson <- TermDocumentMatrix(oz[5])

TermDocumentMatrix

TermDocumentMatrix

findFreqTerms(x, lowfreq = 0, highfreq = Inf)

findFreqTerms(ozMatBaum, 130)

[1] "after" "all" "and" "\"and"

"answered" "any"

TermDocumentMatrix

removeSparseTerms(x, sparse)

ozMat.sparse <- removeSparseTerms(ozMat,

0.2)

Számoljunk szavakat! (A Twitteren)

library(ROAuth)

library(twitteR)

twitteR

## Twitter login

reqURL <- "https://api.twitter.com/oauth/request_token"

accessURL <- "http://api.twitter.com/oauth/access_token"

authURL <- "http://api.twitter.com/oauth/authorize"

twitteR

consumerKey <- "YourConsumerKey"

consumerSecret <- "YourSecretKey"

twitteR

twitCred <- OAuthFactory$new(consumerKey=consumerKey,

consumerSecret=consumerSecret,

requestURL=reqURL,

accessURL=accessURL,

authURL=authURL)

twitteR

> twitCred$handshake()

To enable the connection, please direct your web browser to:

http://api.twitter.com/oauth/authorize?oauth_token=XXXXXjRac3X3XX4dGrCVhXXXXXW22VycNXFb6U

When complete, record the PIN given to you and provide it here: XXXXXXX

> registerTwitterOAuth(twitCred)

[1] TRUE

• RStudio momentán nem képes ezt kezelni!!!!

twitteR

tomi_tweets <- userTimeline("dajcstomi", n=500)

# tweets dataframe

tomi_df <- twListToDF(tomi_tweets)

# get the text

tomi_txt <- tomi_df$text

Normalizálás 1. - gsub()

# remove punctuation symbols

tomi_clean <- gsub("[[:punct:]]", "", tomi_txt)

# remove numbers

tomi_clean <- gsub("[[:digit:]]", "", tomi_clean)

Normalizálás 2. - tm_map()

# corpus

tomi_corpus <- Corpus(VectorSource(tomi_txt))

# convert to lower case

tomi_corpus <- tm_map(tomi_corpus, tolower)


# remove stopwords

tomi_corpus <- tm_map(tomi_corpus, removeWords, c(stopwords("hungarian"),

"dajcstomi"))

# remove extra white-spaces

tomi_corpus <- tm_map(tomi_corpus, stripWhitespace)


# stem document

tomi_corpus <- tm_map(tomi_corpus, stemDocument, language="hungarian")

# term-document matrix

tdm <- TermDocumentMatrix(tomi_corpus)

@dajcstomi

# characters per tweet

> chars_per_tweet <- sapply(tomi_clean, nchar)

> summary(chars_per_tweet)

Min. 1st Qu. Median Mean 3rd Qu. Max.

0.00 50.00 92.00 82.97 118.00 138.00

@dajcstomi

# words per tweet

words_per_tweet <-

sapply(words_list, length)

@dajcstomi

uniq_words <-

sapply(words_list,

function(x) length(unique(x)))

@dajcstomi

# most frequent words

mfw <-

sort(table(unlist(words_list)),

decreasing=TRUE)

# top-20

top20 <- head(mfw, 20)

@dajcstomi vs webkorpusz

Szógyakoriság – mit mond?

rezsi_corpus <- Corpus(VectorSource(rezsi_txt))

rezsi_corpus <- tm_map(rezsi_corpus, tolower)

rezsi_corpus <- tm_map(rezsi_corpus, removeWords, stopwords("hungarian"))

rezsi_corpus <- tm_map(rezsi_corpus, stripWhitespace)

rezsi_corpus <- tm_map(rezsi_corpus, stemDocument, language="hungarian")

rezsi.tdm <- TermDocumentMatrix(rezsi_corpus)

rezsi.m <- as.matrix(rezsi.tdm)

rezsi.tdm2 <- removeSparseTerms(rezsi.tdm, sparse=0.95)

Szógyakoriság

• eltérünk Zipf-től

• extrém eset

Szógyakoriság

library(cluster)

library(FactoMineR)

Szófelhő

tweets <- searchTwitter("fidesz OR mszp", n=1000, lang="hu")

Szófelhő

word_freqs <- sort(rowSums(m), decreasing=TRUE)

dm <- data.frame(word=names(word_freqs), freq=word_freqs)

wordcloud(dm$word, dm$freq, random.order=FALSE, colors=brewer.pal(8, "Dark2"))

Szófelhő

Összehasonlító szófelhők

# Bajnai Gordon

bg_tweets = userTimeline("Bajnai_Gordon", n=1000)

# Mesterházy Attila

ma_tweets = userTimeline("mesterhazymszp", n=1000)

# Deutsch Tamás

dt_tweets = userTimeline("dajcstomi", n=1000)

# Szanyi Tibor

st_tweets = userTimeline("szanyitibor", n=1000)

# Jávor Benedek

jb_tweets = userTimeline("javorbenedek", n=1000)

Comparison

comparison.cloud(tdm,

random.order=FALSE,

colors = c("#00B2FF",

"red",

"#FF0099",

"#6600CC",

"brown"),

title.size=1)

Commonality

commonality.cloud(tdm,

random.order=FALSE,

colors = brewer.pal(8, "Dark2"),

title.size=1)

Mi a baj a szófelhőkkel?

• Gyakorisági táblák, csak szebbek

• Nem mondanak el semmit egy szövegről, normális esetben

• Több szövegről nem képesek beszélni

• Túl kell lépni rajtuk!

Szófelhők helyett

• Összehasonlító szófelhő

• Közös előfordulások ábrázolása

• Conway szófelhő

Ami kimaradt

• Dokumentum összehasonlítás

• Információkinyerés (IR) és keresés

• Korpusznyelvészet – pl. mutual information, Google sets

• Entity extraction, relation mining

• POS tagging, stemming, szemantika

Take-home message

• NLP pipeline (előfeldolgozás, normalizálás, elemzés)

• Barkácsolás (nyugodtan használd, amit a neten találsz)

• Értelmezés (a szövegbányászat sokszor nem technikai, hanem értelmezési probléma)

A szavak titkos élete

• Pennebaker: The Secret Life of Pronouns

• Miller: Spent

• Szentiment- és emócióelemzés

Köszönöm a figyelmet!

@zoltanvarju

Kereső Világ http://kereses.blog.hu/

[email protected]

budapest users of r network - 2013. november 27

Technology