Wordcloud

3 minute read

I wrote a shiny app to extract the words from leading hungarian online news sites and plot a wordcloud from them. You can find the application here: Wordcloud

Note: the application’s language is Hungarian!

The code for the application:

ui.R

# Define the UI
#libraries
library(shiny)
library(rvest)
library(dplyr)
library(tm)
library(jsonlite)
library(corpus)
library(hunspell)
library(wordcloud2)
library(extrafont)
loadfonts()

fluidPage(
        pageWithSidebar(
                # Application title
                headerPanel("Magyar hírportálok címlapjainak szófelhői"),
                # Sidebar with inputs and buttons
                sidebarPanel(
#                        htmlOutput("valami", inline = TRUE),
#                        tags$hr(),
                        selectInput('data', "Hírportál:",
                                    c("24.hu" = "24.hu",
                                      "444" = "444",
                                      "888" = "888",
                                      "index" = "index",
                                      "Magyar Nemzet Online" = "Magyar Nemzet Online",
                                      "Origo" = "Origo")),
                        sliderInput('size', 'Betűméret', min = 0.2, max = 1,
                                    value = 0.4,step = 0.1),
                        actionButton("update_h4", "24.hu frissítése"),
                        actionButton("update_n", "444 frissítése"),
                        actionButton("update_ny", "888 frissítése"),
                        actionButton("update_i", "index frissítése"),
                        actionButton("update_mno", "Magyar Nemzet Online frissítése"),
                        actionButton("update_o", "Origo frissítése")),
                # Main Panel
                mainPanel(
                        #tags$head(
                        #        tags$style(HTML('div#wcLabel {display: none;}'))
                        #),
                        wordcloud2Output('wordcloud2',height = "500px"))
        )
)

server.R

# Hungarian stopword dictionary
hu_stopwords <- fromJSON('https://raw.githubusercontent.com/6/stopwords-json/master/dist/hu.json')

# Set the dictionary path:
# setwd('/Users/mac/Desktop/Data Science/Pet Projects/Online Media Frontpage Wordcloud/')
hun <- dictionary("hu_HU.dic")
#print(hun)

# Custom functions

# Stemming
myStem <- function(x) {
        res <- hunspell_stem(x, dict = hun)
        idx <- which(lengths(res)==0)
        if (length(idx)>0){
                res[idx] <- x[idx]
        }
        sapply(res, tail, 1)
}
# Content transformer for tm_map
hunstemmer <- content_transformer(function(txt){paste(myStem(strsplit(txt, " ")[[1]]))})

# Text processing
text_proc <- function(x) {
        # Create corpus
        temp <- VCorpus(VectorSource(x))
        # Remove Punctation, Numbers
        temp <- tm_map(temp, removePunctuation)
        temp <- tm_map(temp, removeNumbers)
        # Remove Whitespace
        temp <- tm_map(temp, stripWhitespace)
        # Stem words
        temp <- tm_map(temp, hunstemmer)
        # Remove Stopwords
        temp <- tm_map(temp, removeWords, hu_stopwords)
        # Create Term Document Matrix
        TDM <- TermDocumentMatrix(temp, control = list(wordLengths=c(2,Inf)))
        TDM_mat <- as.matrix(TDM)
        # Create frequencies of words
        TDM_freq <- sort(rowSums(TDM_mat), decreasing=TRUE)
        result <- data.frame(word = names(TDM_freq), freq=TDM_freq)
        rm(temp, TDM, TDM_freq)
        return(result)
}

# Web Scraping ORIGO, index.hu, 444.hu, 24.hu, mno.hu, 888.hu
get_o <- function(){
        # ORIGO
        o_url <- read_html('http://www.origo.hu/index.html')
        o_cikk <- o_url %>% html_nodes(".news-text-block h2, p") %>% html_text()
        o_words <- text_proc(o_cikk)
        return(o_words)
}
get_i <- function(){
        # index.hu
        i_url <- read_html('http://www.index.hu')
        i_cikk <- i_url %>% html_nodes(".cikkcim a, .ajanlo span") %>% html_text()
        i_words <- text_proc(i_cikk)
        return(i_words)
}
get_n <- function(){
        # 444.hu
        n_url <- read_html('http://www.444.hu')
        n_cikk <- n_url %>% html_nodes("h1.title, h4 a, p") %>% html_text()
        n_words <- text_proc(n_cikk)
        return(n_words)
}
get_h4 <- function(){
        # 24.hu
        h4_url <- read_html('http://www.24.hu')
        h4_cikk <- h4_url %>% html_nodes(".post-title, .lead-after-wrap p") %>% html_text()
        h4_words <- text_proc(h4_cikk)
        return(h4_words)
}
get_mno <- function(){
        # mno.hu
        mno_url <- read_html('http://www.mno.hu')
        mno_cikk <- mno_url %>% html_nodes(".content h3, .lead p") %>% html_text()
        mno_words <- text_proc(mno_cikk)
        return(mno_words)
}
get_ny <- function(){
        # 888.hu
        ny_url <- read_html('http://www.888.hu')
        ny_cikk <- ny_url %>% html_nodes(".fig-wrap h2, .col-holder h2,
                                         .text-box p, .text-wrap h2, .text-wrap p") %>% html_text()
        ny_words <- text_proc(ny_cikk)
        return(ny_words)
}

# Define mapping table
media_fonts <- data.frame(t(c('Titillium-Bold','HelveticaNeue','Roboto-Bold','OpenSans-Regular','Publico-Medium','SourceSansPro-Bold')),
                          stringsAsFactors = F)
#media_fonts <- data.frame(t(test), stringsAsFactors = F)
dfs <- data.frame(t(c('h4_words','n_words', 'ny_words', 'i_words', 'mno_words', 'o_words')),
                  stringsAsFactors = F)
color <- data.frame(t(c('#fff','#222','#fff','#323232','#000','#fff')),
                    stringsAsFactors = F)
bcolor <- data.frame(t(c('#64667a','#ffff73','#e01365','#fff','#DCEBF0','#155092')),
                     stringsAsFactors = F)
last_update <- data.frame(t(c('24.hu','444.hu','888.hu','index.hu','MNO','Origo')),
                          stringsAsFactors = F)
keys <- rbind(dfs,media_fonts,color,bcolor,last_update)
colnames(keys) <- c("24.hu", "444", "888", "index", "Magyar Nemzet Online", "Origo")

# Define the server code
function(input, output, session) {
        # Progress bar
        progress <- Progress$new(session, min=1, max=7)
        progress$set(message = 'Adatok begyűjtése',
                     detail = 'Ez eltarthat egy ideig...')
        progress$set(value = 1)
        o_words <- get_o()
        progress$set(value = 2)
        i_words <- get_i()
        progress$set(value = 3)
        n_words <- get_n()
        progress$set(value = 4)
        h4_words <- get_h4()
        progress$set(value = 5)
        mno_words <- get_mno()
        progress$set(value = 6)
        ny_words <- get_ny()
        progress$set(value = 7)
        progress$close()
        #output$valami <- renderUI({HTML("text()")})
        output$wordcloud2 <- renderWordcloud2({
                wordcloud2(data=get(keys[1,input$data]), size=input$size,
                           color = keys[3,input$data],
                           backgroundColor = keys[4,input$data],
                           fontFamily = keys[2,input$data])
        })
        observeEvent(input$update_h4, {
                progress <- Progress$new(session, min=1, max=2)
                progress$set(message = 'Adatok begyűjtése',
                             detail = 'Ez eltarthat egy ideig...')
                progress$set(value = 1)
                get_h4()
                keys[5,1] <- format(Sys.time(), "%Y. %m. %d. %H:%M:%S")
                progress$set(value = 2)
                progress$close()
        })
        observeEvent(input$update_n, {
                progress <- Progress$new(session, min=1, max=2)
                progress$set(message = 'Adatok begyűjtése',
                             detail = 'Ez eltarthat egy ideig...')
                progress$set(value = 1)
                get_n()
                keys[5,2] <- format(Sys.time(), "%Y. %m. %d. %H:%M:%S")
                progress$set(value = 2)
                progress$close()
        })
        observeEvent(input$update_ny, {
                progress <- Progress$new(session, min=1, max=2)
                progress$set(message = 'Adatok begyűjtése',
                             detail = 'Ez eltarthat egy ideig...')
                progress$set(value = 1)
                get_ny()
                keys[5,3] <- format(Sys.time(), "%Y. %m. %d. %H:%M:%S")
                progress$set(value = 2)
                progress$close()
        })
        observeEvent(input$update_i, {
                progress <- Progress$new(session, min=1, max=2)
                progress$set(message = 'Adatok begyűjtése',
                             detail = 'Ez eltarthat egy ideig...')
                progress$set(value = 1)
                get_i()
                keys[5,4] <- format(Sys.time(), "%Y. %m. %d. %H:%M:%S")
                progress$set(value = 2)
                progress$close()
        })
        observeEvent(input$update_mno, {
                progress <- Progress$new(session, min=1, max=2)
                progress$set(message = 'Adatok begyűjtése',
                             detail = 'Ez eltarthat egy ideig...')
                progress$set(value = 1)
                get_mno()
                keys[5,5] <- format(Sys.time(), "%Y. %m. %d. %H:%M:%S")
                progress$set(value = 2)
                progress$close()
        })
        observeEvent(input$update_o, {
                progress <- Progress$new(session, min=1, max=2)
                progress$set(message = 'Adatok begyűjtése',
                             detail = 'Ez eltarthat egy ideig...')
                progress$set(value = 1)
                get_o()
                keys[5,6] <- format(Sys.time(), "%Y. %m. %d. %H:%M:%S")
                progress$set(value = 2)
                progress$close()
        })
}

Leave a Comment