Simple uses of the vagalumeR package

The website https://www.vagalume.com.br provides information about songs and their lyrics (with translations, when they are not in Portuguese). The same website has an API, which can be accessed from R through the vagalumeR package. Here we demonstrate some of the package's functionality, focusing on the analysis of song lyrics.

# Package loading
library(vagalumeR)
library(tidyverse)
library(tm)
library(lattice)
library(lexiconPT)
library(latticeExtra)
library(wordcloud)
library(networkD3)

packageVersion("vagalumeR")
[1] '0.1.1'

To access the API, you need an access token, which can be obtained at the web address https://auth.vagalume.com.br/settings/api/ — the steps are:

  1. Access the link above,
  2. Create an account,
  3. Access the link again to get the token.
key <- "insert-your-token"
# Artist selection
artist <- "chico-buarque"

# Information
(info <- artistInfo(artist))
                 id          name views pos period uniques points
1 3ade68b4g66c6eda3 Chico Buarque 10436  71 201806    5180   19.2
# Discography
disc <- discography(artist)
head(disc, 10)
            album.id          album.name           label date
1  3ade68b6g040dfda3           Caravanas   Biscoito Fino 2017
2  3ade68b6g5b88fda3               Chico   Biscoito Fino 2011
3  3ade68b6g4638fda3             Carioca   Biscoito Fino 2006
4  3ade68b6g89d6fda3              Duetos                 2002
5  3ade68b6g18d6fda3          O Sambista                 2000
6  3ade68b6g48d6fda3 As Cidades -Ao Vivo Universal Music 1999
7  3ade68b6g29d6fda3          As Cidades                 1999
8  3ade68b6g39d6fda3         Uma Palavra                 1995
9  3ade68b6g19d6fda3           Paratodos                 1994
10 3ade68b6g69d6fda3      Ao Vivo: Paris             BMG 1990
                  id          name
1  3ade68b4g66c6eda3 Chico Buarque
2  3ade68b4g66c6eda3 Chico Buarque
3  3ade68b4g66c6eda3 Chico Buarque
4  3ade68b4g66c6eda3 Chico Buarque
5  3ade68b4g66c6eda3 Chico Buarque
6  3ade68b4g66c6eda3 Chico Buarque
7  3ade68b4g66c6eda3 Chico Buarque
8  3ade68b4g66c6eda3 Chico Buarque
9  3ade68b4g66c6eda3 Chico Buarque
10 3ade68b4g66c6eda3 Chico Buarque
# Album count per year
xtabs(~date, disc)
date
1966 1967 1968 1970 1971 1972 1973 1974 1976 1978 1980 1981 1982 1984 1985 
   1    1    1    2    1    1    1    1    2    2    1    1    2    1    1 
1987 1989 1990 1994 1995 1999 2000 2002 2006 2011 2017 
   1    2    1    1    1    2    1    1    1    1    1 
# Related artists
(rel <- relatedInfo(artist))[,4]
 [1] Vinicius de Moraes Gilberto Gil       Tom Jobim         
 [4] Milton Nascimento  Elis Regina        Maria Bethânia    
 [7] Caetano Veloso     Zeca Baleiro       Maria Rita        
[10] Lenine            
10 Levels: Caetano Veloso Elis Regina Gilberto Gil ... Zeca Baleiro
# Lyrics search: list the artist's songs, download the lyrics of the
# first ten by song id, and stack the results into one data frame
song <- songNames(artist)
let <- song$song.id[1:10] %>%
  map(lyrics, type = "id", key = key) %>%
  plyr::ldply(data.frame)

str(let)
'data.frame':   10 obs. of  6 variables:
 $ id      : Factor w/ 1 level "3ade68b4g66c6eda3": 1 1 1 1 1 1 1 1 1 1
 $ name    : Factor w/ 1 level "Chico Buarque": 1 1 1 1 1 1 1 1 1 1
 $ song.id : Factor w/ 10 levels "3ade68b8g4c56cfa3",..: 1 2 3 4 5 6 7 8 9 10
 $ song    : Factor w/ 10 levels "A Aurora de Nova Iorque",..: 1 2 3 4 5 6 7 8 9 10
 $ language: int  1 1 1 1 1 1 1 1 1 1
 $ text    : chr  "A aurora de nova iorque tem  Quatro colunas de lodo  E um furacão de pombas  Que explode as águas podres.   A a"| __truncated__ "Estava à toa na vida O meu amor me chamou Pra ver a banda passar Cantando coisas de amor  A minha gente sofrida"| __truncated__ "Ouve a declaração, oh bela De um sonhador titã Um que dá nó em paralela  E almoça rolimã O homem mais forte do "| __truncated__ "Na cidade Ser artista É posar sorridente É ver se de repente Sai numa revista  É esperar que o orelhão Complete"| __truncated__ ...
#--------------------------------------------
# Word frequencies --------------------------
#--------------------------------------------
  
# Build the corpus from the lyrics and clean it in a single pipeline:
# lower-case, drop punctuation and numbers, collapse whitespace, then
# remove Portuguese stopwords and tab characters
cps <- VectorSource(let$text) %>%
  VCorpus(readerControl = list(language = "pt")) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, words = stopwords("portuguese")) %>%
  tm_map(removeWords, words = "\t")

# Create document-term matrix
dtm <- DocumentTermMatrix(cps)

inspect(dtm)
<<DocumentTermMatrix (documents: 10, terms: 570)>>
Non-/sparse entries: 655/5045
Sparsity           : 89%
Maximal term length: 14
Weighting          : term frequency (tf)
Sample             :
    Terms
Docs amor banda bela bonita cidade finjo nunca passar pra ver
  1     0     0    0      0      1     0     0      0   0   0
  10    1     0    0      0      0     0     0      0   1   0
  2     6     7    0      0      1     0     0      5   6   5
  3     0     0    5      0      0     0     0      0   0   0
  4     0     0    0      0      3     0     0      0   0   1
  5     0     0    1      0      9     0     1      0   0   0
  6     0     0    0      0      0     0     0      0   0   0
  7     0     0    0      0      0     0     0      0   0   0
  8     0     0    0      0      0     0     7      0   0   0
  9     0     0    0      7      0     8     0      2   4   4
# Minimum frequency terms
tms <- findFreqTerms(dtm, lowfreq = 3)
str(tms)
 chr [1:63] "águas" "alamedas" "amor" "amores" "artista" "atenção" ...
# Associated terms
assoc <- findAssocs(dtm, terms = tms, corlimit = 0.9)
b <- data.frame(pal = "amor", assoc = names(assoc$amor))

# Visualizing the network
simpleNetwork(b,
              opacity = 0.8, 
              linkColour= "violet", 
              nodeColour="tomato", 
              zoom=TRUE, fontSize = 16,
              linkDistance = 70)
# Word count
frq <- slam::colapply_simple_triplet_matrix(dtm, FUN = sum)
frq <- sort(frq, decreasing = TRUE)

# Plotting the 30 most frequent words
barchart(head(frq, n = 30), xlim = c(0, NA),
         col =  "lightsalmon", 
         xlab = "Frequency",
         ylab = "Words",
         main = "Chico Buarque songs",
         strip = strip.custom(bg = "white"))

# Wordcloud
wordcloud(cps,
          family = "serif", 
          min.freq = 3,
          max.words = 30,
          colors = brewer.pal(7, "Paired"))

#--------------------------------------------
# Polarities --------------------------------
#--------------------------------------------

# Sentiment dictionary in Portuguese - lexiconPT
dic <- lexiconPT::oplexicon_v3.0
str(dic)
'data.frame':   32191 obs. of  4 variables:
 $ term             : chr  "=[" "=@" "=p" "=P" ...
 $ type             : chr  "emot" "emot" "emot" "emot" ...
 $ polarity         : int  -1 -1 -1 -1 -1 1 1 1 1 -1 ...
 $ polarity_revision: chr  "A" "A" "A" "A" ...
# Intersections: dictionary & songs
inter <- intersect(x = Terms(dtm),
                   y = dic$term)
length(inter)
[1] 122
# Brings polarities to words in the lyrics
lex <- merge(x = data.frame(term = inter,
                            stringsAsFactors = FALSE),
             y = dic,
             sort = FALSE)
head(lex, 10)
           term type polarity polarity_revision
1  administrada  adj        0                 A
2        alegre  adj        1                 M
3          amar   vb        1                 A
4    analfabeto  adj       -1                 A
5      apertado  adj       -1                 M
6       artista  adj        0                 A
7      artistas  adj        0                 A
8          azul  adj        0                 M
9         beber   vb        0                 A
10         bela  adj        1                 M
# Convert the document-term matrix to an ordinary matrix
# (rows = songs, columns = terms)
m <- as.matrix(dtm)
# Keep only the columns of terms found in the lexicon, ordered like
# the rows of `lex` so they line up with `lex$polarity` below
m <- m[, lex$term]

# Row sums: total count of lexicon terms in each song
rtot <- rowSums(m)

# "Loadings" of each song: polarity-weighted term counts divided by the
# song's total count of lexicon terms; the outer parentheses print the
# result as well as assigning it
(carga <- (m %*% lex$polarity)/rtot)
    
Docs        [,1]
  1   0.37500000
  2   0.00000000
  3   0.33333333
  4   0.32258065
  5   0.06976744
  6  -0.35714286
  7   0.00000000
  8   0.44444444
  9   0.26923077
  10  0.27272727
# Sum of polarities per song
m %*% cbind(lex$polarity)
    
Docs [,1]
  1     3
  2     0
  3     5
  4    10
  5     3
  6    -5
  7     0
  8     4
  9     7
  10    3
# Function to calculate the proportion of each polarity
#
# Cleans the given lyrics the same way as the main corpus (lower-case,
# no punctuation/numbers, collapsed whitespace, no Portuguese stopwords),
# keeps the distinct terms that also appear in the global sentiment
# dictionary `dic`, and computes the share of those terms falling into
# each polarity class.
#
# Args:
#   song: character vector with the lyrics to analyse.
#
# Returns:
#   A named numeric vector of length 3 with names "-1", "0" and "1"
#   (negative, neutral, positive). A class with no matching term gets a
#   proportion of 0; if no term matches the dictionary at all, the three
#   values are NA.
pp <- function(song){

  cps <- VCorpus(VectorSource(song),
                 readerControl = list(language = "pt"))
  cps <- tm_map(cps, FUN = content_transformer(tolower))
  cps <- tm_map(cps, FUN = removePunctuation)
  cps <- tm_map(cps, FUN = removeNumbers)
  cps <- tm_map(cps, FUN = stripWhitespace)
  cps <- tm_map(cps,
                FUN = removeWords,
                words = stopwords("portuguese"))
  cps <- tm_map(cps,
                FUN = removeWords,
                words = "\t")

  dtm <- DocumentTermMatrix(cps)

  inter <- intersect(x = Terms(dtm),
                     y = dic$term)

  lex <- merge(x = data.frame(term = inter,
                              stringsAsFactors = FALSE),
               y = dic,
               sort = FALSE)

  # Tabulate over the three fixed polarity levels so the result always
  # has one entry per class, even when a class is absent from the song;
  # otherwise callers that reshape the result into three columns would
  # receive a shorter, misaligned vector.
  polarity <- factor(lex$polarity, levels = c(-1, 0, 1))
  counts <- table(polarity)

  # No dictionary term in the song: proportions are undefined.
  if (sum(counts) == 0) {
    return(setNames(rep(NA_real_, 3), levels(polarity)))
  }

  c(prop.table(counts))
}


(pcs <- plyr::ldply(map(let$text, pp), matrix, ncol = 3))
           1         2         3
1  0.1428571 0.4285714 0.4285714
2  0.3809524 0.3809524 0.2380952
3  0.3000000 0.3000000 0.4000000
4  0.0800000 0.6000000 0.3200000
5  0.3333333 0.3750000 0.2916667
6  0.4615385 0.3846154 0.1538462
7  0.2857143 0.2857143 0.4285714
8  0.1428571 0.4285714 0.4285714
9  0.3000000 0.4000000 0.3000000
10 0.2727273 0.1818182 0.5454545
names(pcs) <- c("neg", "neutro", "pos")

# Cumulative distributions of the sentiments in 
# Chico Buarque songs 
# Colours are matched to the groups in formula order (pos, neg, neutro),
# so the legend labels and line colours are taken from the same named
# vector to keep the key consistent with the plotted curves.
sent_cols <- c(pos = "tomato", neg = "turquoise", neutro = "orange")
ecdfplot(~pos+neg+neutro, pcs, 
         col = unname(sent_cols),
         key = list(columns = 1,
                    corner = c(0.05, 0.9),
                    lines = list(col = unname(sent_cols)),
                    text = list(names(sent_cols))))

comments powered by Disqus