Web Scraping and Data Collection Using RSelenium:

Scraping Sotheby’s art auctions


Guadalupe González

Sep 25, 2024

Materials for today’s class:






https://github.com/guadag12/selenium-r-workshop

Configuration issues first!

We are going to use Chrome to do the scraping, so the following instructions are needed.

1. Check Your Google Chrome Version:

Open Google Chrome and navigate to chrome://settings/help to find out your current Chrome version.

2. Execute the following command in R:

wdman::selenium(retcommand = TRUE, check = FALSE)
[1] "C:\\PROGRA~2\\COMMON~1\\Oracle\\Java\\javapath\\java.exe c(\"-Dwebdriver.chrome.driver=\\\"C:\\\\Users\\\\User\\\\AppData\\\\Local\\\\binman\\\\binman_chromedriver\\\\win32\\\\128.0.6613.121/chromedriver.exe\\\"\", \"-Dwebdriver.chrome.driver=\\\"C:\\\\Users\\\\User\\\\AppData\\\\Local\\\\binman\\\\binman_chromedriver\\\\win32\\\\128.0.6613.121/LICENSE.chromedriver\\\"\", \"-Dwebdriver.chrome.driver=\\\"C:\\\\Users\\\\User\\\\AppData\\\\Local\\\\binman\\\\binman_chromedriver\\\\win32\\\\128.0.6613.121/THIRD_PARTY_NOTICES.chromedriver\\\"\") -Dwebdriver.gecko.driver=\"C:\\Users\\User\\AppData\\Local\\binman\\binman_geckodriver\\win64\\0.35.0/geckodriver.exe\" -Dphantomjs.binary.path=\"C:\\Users\\User\\AppData\\Local\\binman\\binman_phantomjs\\windows\\2.1.1/phantomjs-2.1.1-windows/bin/phantomjs.exe\" -jar \"C:\\Users\\User\\AppData\\Local\\binman\\binman_seleniumserver\\generic\\4.0.0-alpha-2/selenium-server-standalone-4.0.0-alpha-2.jar\" -port 4567"


After that, go to the folder mentioned in the command output (copy and paste the one that you got). Mine is “C:\\Users\\User\\AppData\\Local\\binman\\binman_chromedriver\\win32\\”, so I will look for: “C:/Users/User/AppData/Local/binman/binman_chromedriver/win32”

3. ChromeDriver:

  1. Download the ChromeDriver matching your Chrome version from Chrome for Testing. If you are using a Mac, download the Mac build that matches your Google Chrome version; if you are using Windows, download the Windows build.

  2. After unzipping the downloaded chromedriver, move the file to the directory identified in step 2. In my case: “C:/Users/User/AppData/Local/binman/binman_chromedriver/win32”
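To double-check that the file landed in the right place, you can list the folder from R (a quick sketch; the path is the example one from this machine, so substitute your own):

driver_dir <- "C:/Users/User/AppData/Local/binman/binman_chromedriver/win32"
list.files(driver_dir, recursive = TRUE)  # chromedriver.exe should appear under a version folder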

4. Verify Installation:

Check if the ChromeDriver is correctly set up by running:

# Check and install wdman if not already installed
if (!requireNamespace("wdman", quietly = TRUE)) {
  install.packages("wdman")
}

wdman::selenium()
$process
PROCESS 'file445c6e561e86.bat', finished.

$output
function (timeout = 0L) 
{
    infun_read(seleniumdrv, log, "stdout", timeout = timeout, 
        outfile = pfile[["out"]], errfile = pfile[["err"]])
}
<bytecode: 0x000002279b965538>
<environment: 0x000002279d485ac0>

$error
function (timeout = 0L) 
{
    infun_read(seleniumdrv, log, "stderr", timeout = timeout, 
        outfile = pfile[["out"]], errfile = pfile[["err"]])
}
<bytecode: 0x000002279b96ae28>
<environment: 0x000002279d485ac0>

$stop
function () 
{
    seleniumdrv$kill()
}
<bytecode: 0x000002279b96a840>
<environment: 0x000002279d485ac0>

$log
function () 
{
    infun_read(seleniumdrv, log, outfile = pfile[["out"]], errfile = pfile[["err"]])
    as.list(log)
}
<bytecode: 0x000002279b96a680>
<environment: 0x000002279d485ac0>

5. Start Selenium Server:

  1. Execute the following command in R:
wdman::selenium(retcommand = TRUE, check = FALSE)
[1] "C:\\PROGRA~2\\COMMON~1\\Oracle\\Java\\javapath\\java.exe c(\"-Dwebdriver.chrome.driver=\\\"C:\\\\Users\\\\User\\\\AppData\\\\Local\\\\binman\\\\binman_chromedriver\\\\win32\\\\128.0.6613.121/chromedriver.exe\\\"\", \"-Dwebdriver.chrome.driver=\\\"C:\\\\Users\\\\User\\\\AppData\\\\Local\\\\binman\\\\binman_chromedriver\\\\win32\\\\128.0.6613.121/LICENSE.chromedriver\\\"\", \"-Dwebdriver.chrome.driver=\\\"C:\\\\Users\\\\User\\\\AppData\\\\Local\\\\binman\\\\binman_chromedriver\\\\win32\\\\128.0.6613.121/THIRD_PARTY_NOTICES.chromedriver\\\"\") -Dwebdriver.gecko.driver=\"C:\\Users\\User\\AppData\\Local\\binman\\binman_geckodriver\\win64\\0.35.0/geckodriver.exe\" -Dphantomjs.binary.path=\"C:\\Users\\User\\AppData\\Local\\binman\\binman_phantomjs\\windows\\2.1.1/phantomjs-2.1.1-windows/bin/phantomjs.exe\" -jar \"C:\\Users\\User\\AppData\\Local\\binman\\binman_seleniumserver\\generic\\4.0.0-alpha-2/selenium-server-standalone-4.0.0-alpha-2.jar\" -port 4567"
  2. Run the generated command in the Terminal. Example:
/usr/bin/java -Dwebdriver.chrome.driver="/usr/local/bin/chromedriver" -jar "/usr/local/bin/selenium-server-standalone.jar" -port 4567
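Alternatively, wdman can launch and manage the server process from inside R instead of the Terminal (a sketch assuming the same example port as above):

library(wdman)
# Start a Selenium server on port 4567 and keep a handle to the process
server <- selenium(port = 4567L)
# When you are done: server$stop()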

  3. Connect RSelenium to the Running Selenium Server:

In R, establish the remote connection as follows:

library(RSelenium)

# Assuming Selenium is running on the specified port (it should match the port used in step 5.2)
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4567L, browserName = "chrome")

# Attempt to open the browser
remDr$open()
[1] "Connecting to remote server"
$acceptInsecureCerts
[1] FALSE

$browserName
[1] "chrome"

$browserVersion
[1] "128.0.6613.121"

$chrome
$chrome$chromedriverVersion
[1] "128.0.6613.137 (fe621c5aa2d6b987e964fb1b5066833da5fb613d-refs/branch-heads/6613@{#1711})"

$chrome$userDataDir
[1] "C:\\Users\\User\\AppData\\Local\\Temp\\scoped_dir27900_1357527518"


$`fedcm:accounts`
[1] TRUE

$`goog:chromeOptions`
$`goog:chromeOptions`$debuggerAddress
[1] "localhost:52702"


$networkConnectionEnabled
[1] FALSE

$pageLoadStrategy
[1] "normal"

$platformName
[1] "windows"

$proxy
named list()

$setWindowRect
[1] TRUE

$strictFileInteractability
[1] FALSE

$timeouts
$timeouts$implicit
[1] 0

$timeouts$pageLoad
[1] 300000

$timeouts$script
[1] 30000


$unhandledPromptBehavior
[1] "dismiss and notify"

$`webauthn:extension:credBlob`
[1] TRUE

$`webauthn:extension:largeBlob`
[1] TRUE

$`webauthn:extension:minPinLength`
[1] TRUE

$`webauthn:extension:prf`
[1] TRUE

$`webauthn:virtualAuthenticators`
[1] TRUE

$webdriver.remote.sessionid
[1] "7746dd002757bb6309d01e4532225f54"

$id
[1] "7746dd002757bb6309d01e4532225f54"

6. If everything works, a new Selenium-controlled Chrome window will open.

What is Selenium? Why use Selenium?

  • Selenium is an open-source framework for automating web browsers. It lets us interact with elements on web pages as if a human user were driving the browser. Example: we will see the mouse cursor move around the webpage as if it were our own.
  • It is supported in R through the RSelenium package.

When to Use Selenium:

  1. Dynamic Web Pages. Example: JavaScript-heavy websites (https://www.nytimes.com/)
  2. Interactive Elements
  3. Testing and Automation

When NOT to Use Selenium:

  1. Static Web Pages. Example: GVPT Faculty
  2. Simple Scraping Tasks. Example: Wikipedia’s pages

Summary:

Use rvest for static content, straightforward and quick data-extraction tasks, and when you need speed and simplicity.

Use Selenium to interact with dynamic web content, perform complex navigation, automate testing, or scrape data from interactive elements that rely on JavaScript.
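For contrast, here is what the rvest path looks like on a static page (a minimal sketch; the Wikipedia URL is just an illustrative static page):

library(rvest)
# One HTTP request, no browser: read the page and pull the top-level heading
page <- read_html("https://en.wikipedia.org/wiki/Web_scraping")
page %>% html_element("h1") %>% html_text2()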


rvest: need help?

Evan Jones has given a workshop on rvest; the materials are here: https://github.com/gsa-gvpt/gvpt-methods/tree/master/webscraping

Auctions at Sotheby’s


Website: https://www.sothebys.com/en/auctions/2012/latin-american-art-n08862.html

Get the page source (right click on the website + Inspect Element):


HTML Syntax

HTML Syntax for today

CSS (1)

CSS (2)

XPATH

XPath (XML Path Language) is a query language used for selecting nodes* from an XML document, which can also be applied to HTML documents.

XPath can be combined with HTML, CSS, and Selenium to give us a more efficient way to interact with elements on a webpage and extract information.

* nodes: “individual parts or components of a document’s structure”. A node can represent different types of content (text, elements [HTML], attributes [CSS]).
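A tiny self-contained illustration of XPath selection, using rvest’s minimal_html() to build a throwaway document (the snippet and its class names are invented for the example):

library(rvest)
doc <- minimal_html('<ul><li class="a">one</li><li class="b">two</li></ul>')
html_elements(doc, xpath = '//li')                    # every <li> element node
html_elements(doc, xpath = '//li[@class="b"]')        # filter on an attribute node
html_elements(doc, xpath = '//li') %>% html_text()    # extract the text nodes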

XPATH in Selenium

Starting to scrape:

  1. Load packages:
library(RSelenium)
library(tidyverse)
library(netstat)
library(httr)
library(wdman)
library(rvest)
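If any of these packages are missing, the same check used for wdman earlier works for the whole list (a small sketch):

pkgs <- c("RSelenium", "tidyverse", "netstat", "httr", "wdman", "rvest")
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))   # which ones are not installed?
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)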
  2. Go to the website that we want to scrape:
# Selenium is running on the port that we chose:
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4567L, browserName = "chrome")

# Open Chrome:
remDr$open()
[1] "Connecting to remote server"
$acceptInsecureCerts
[1] FALSE

$browserName
[1] "chrome"

$browserVersion
[1] "128.0.6613.121"

$chrome
$chrome$chromedriverVersion
[1] "128.0.6613.137 (fe621c5aa2d6b987e964fb1b5066833da5fb613d-refs/branch-heads/6613@{#1711})"

$chrome$userDataDir
[1] "C:\\Users\\User\\AppData\\Local\\Temp\\scoped_dir27120_539407128"


$`fedcm:accounts`
[1] TRUE

$`goog:chromeOptions`
$`goog:chromeOptions`$debuggerAddress
[1] "localhost:52724"


$networkConnectionEnabled
[1] FALSE

$pageLoadStrategy
[1] "normal"

$platformName
[1] "windows"

$proxy
named list()

$setWindowRect
[1] TRUE

$strictFileInteractability
[1] FALSE

$timeouts
$timeouts$implicit
[1] 0

$timeouts$pageLoad
[1] 300000

$timeouts$script
[1] 30000


$unhandledPromptBehavior
[1] "dismiss and notify"

$`webauthn:extension:credBlob`
[1] TRUE

$`webauthn:extension:largeBlob`
[1] TRUE

$`webauthn:extension:minPinLength`
[1] TRUE

$`webauthn:extension:prf`
[1] TRUE

$`webauthn:virtualAuthenticators`
[1] TRUE

$webdriver.remote.sessionid
[1] "b8fdf567f156663e72ebcd2244e59de9"

$id
[1] "b8fdf567f156663e72ebcd2244e59de9"
# Navigate to the webpage that we want to scrape
remDr$navigate("https://www.sothebys.com/en/auctions/2012/latin-american-art-n08862.html")

Scraping the author’s name:

  1. Get the source page:
page_source <- remDr$getPageSource()[[1]] #raw HTML source code
page_html <- read_html(page_source) #parses the raw HTML into a format that allows for HTML element selection and manipulation
page_html
{html_document}
<html class="LotPage" lang="en">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body class="AuctionPage-body this_one" data-redactable-token="-----BEGIN ...


  2. Extract the auction items:
artworks <- page_html %>% html_nodes(xpath = '//li[@class="AuctionsModule-results-item"]')
artworks
{xml_nodeset (12)}
 [1] <li class="AuctionsModule-results-item">\n                        <div c ...
 [2] <li class="AuctionsModule-results-item">\n                        <div c ...
 [3] <li class="AuctionsModule-results-item">\n                        <div c ...
 [4] <li class="AuctionsModule-results-item">\n                        <div c ...
 [5] <li class="AuctionsModule-results-item">\n                        <div c ...
 [6] <li class="AuctionsModule-results-item">\n                        <div c ...
 [7] <li class="AuctionsModule-results-item">\n                        <div c ...
 [8] <li class="AuctionsModule-results-item">\n                        <div c ...
 [9] <li class="AuctionsModule-results-item">\n                        <div c ...
[10] <li class="AuctionsModule-results-item">\n                        <div c ...
[11] <li class="AuctionsModule-results-item">\n                        <div c ...
[12] <li class="AuctionsModule-results-item">\n                        <div c ...

  3. Extract the author’s name:
author <- artworks %>% html_node(xpath = './/div[@class="title "]/a') %>% html_text(trim = TRUE)
author
 [1] "1. Diego Rivera"           "2. Fernando Botero"       
 [3] "3. José Pancetti"          "4. Héctor Poleo"          
 [5] "5. Emiliano di Cavalcanti" "6. Matta"                 
 [7] "7. Leonora Carrington"     NA                         
 [9] "9. Carlos Mérida"          "10. Francisco Toledo"     
[11] "11. Armando Reverón"       "12. Fernando Botero"      

Scraping the title of each piece of art:

  1. Extract Title of the art piece:
title <- artworks %>% html_node(xpath = './/div[@class="description"]') %>% html_text(trim = TRUE)
title
 [1] "Mujer con dalias"                     
 [2] "Naturaleza muerta con frutas"         
 [3] "Saquarema"                            
 [4] "Maternidad"                           
 [5] "Natureza morta"                       
 [6] "Mourir pour le pain (Composition 50F)"
 [7] "Santa Teresa en la cocina"            
 [8] ""                                     
 [9] "Fecundidad"                           
[10] "Autorretrato"                         
[11] "Desnudo detrás de la mantilla"        
[12] "Donna Seduta"                         

Scraping the estimate value of each piece of art:

  1. Extract the estimated price of the art piece:
estimate_text <- artworks %>% html_node(xpath = './/div[@class="estimate"]') %>% html_text(trim = TRUE)
estimate_text
 [1] "Estimate: 40,000 – 60,000 USD"       "Estimate: 40,000 – 50,000 USD"      
 [3] "Estimate: 40,000 – 60,000 USD"       "Estimate: 150,000 – 200,000 USD"    
 [5] "Estimate: 150,000 – 200,000 USD"     "Estimate: 250,000 – 350,000 USD"    
 [7] "Estimate: 350,000 – 450,000 USD"     "Estimate: 2,000,000 – 3,000,000 USD"
 [9] "Estimate: 40,000 – 60,000 USD"       "Estimate: 90,000 – 120,000 USD"     
[11] "Estimate: 700,000 – 900,000 USD"     "Estimate: 300,000 – 350,000 USD"    
  2. Split the vector into two separate vectors, one containing the values before the dash and another containing the values after the dash:
result <- tibble(estimate_text) %>%
  mutate(values = str_remove_all(estimate_text, "Estimate: | USD"), # Remove unnecessary text
         values = str_remove_all(values, ",")) %>%           # Remove commas
  separate(values, into = c("before", "after"), sep = " – ") %>% # Split into before and after
  mutate(across(c(before, after), as.numeric))          # Convert to numeric

# Extracting the minimum and maximum vectors
min_estimate <- result$before
max_estimate <- result$after
print(min_estimate)
 [1]   40000   40000   40000  150000  150000  250000  350000 2000000   40000
[10]   90000  700000  300000
print(max_estimate)
 [1]   60000   50000   60000  200000  200000  350000  450000 3000000   60000
[10]  120000  900000  350000

What if I want to get the Sold price?

Is there a way to do that automatically?

We need to log in to get that data!

Log in to get more information!

  1. Press the “Log In” button:
log_in_button <- remDr$findElement(using = "xpath", '//a[@data-text-content="Log In"]')
log_in_button$clickElement()
  2. Find the fields where we should enter the email and password:
email_field <- remDr$findElement(using = "xpath", '//input[@placeholder="Email address"]')
password_field <- remDr$findElement(using = "xpath", '//input[@placeholder="Password"]')
  3. Enter this email and password, which are already registered:
email_field$sendKeysToElement(list("r.workshop.umd@gmail.com"))
password_field$sendKeysToElement(list("Workshop123"))
  4. Click on “Log In” to return to our original website:
login_button <- remDr$findElement(using = "id", 'login-button-id')
login_button$clickElement()
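findElement() errors immediately if the page has not finished rendering, so logins like this can be flaky. A hedged helper that polls for an element before giving up (wait_for_element is our own function, not part of RSelenium):

wait_for_element <- function(remDr, xpath, timeout = 10) {
  for (i in seq_len(timeout)) {
    el <- tryCatch(remDr$findElement(using = "xpath", xpath),
                   error = function(e) NULL)   # swallow "not found" and retry
    if (!is.null(el)) return(el)
    Sys.sleep(1)
  }
  stop("Element not found after ", timeout, " seconds: ", xpath)
}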

Now try to get the sold price!

remDr$navigate("https://www.sothebys.com/en/auctions/2012/latin-american-art-n08862.html")
Sys.sleep(5)

# Re-read the page source now that we are logged in: the earlier `artworks`
# nodes were parsed before the login
page_source <- remDr$getPageSource()[[1]]
page_html <- read_html(page_source)
artworks <- page_html %>% html_nodes(xpath = '//li[@class="AuctionsModule-results-item"]')

sold_price <- artworks %>% html_node(xpath = './/div[@class="sold"]') %>% html_text(trim = TRUE) %>% str_replace("Lot Sold: ", "")
sold_price
 [1] "" "" "" "" "" "" NA "" "" "" "" ""

What if I want to build a dataset with this information?

artworks_info <- data.frame(
      Author = author,
      Title = title,
      Min_Estimate = min_estimate,
      Max_Estimate = max_estimate,
      Sold_Price = sold_price,
      stringsAsFactors = FALSE
    )
artworks_info
   Author                     Title                                   Min_Estimate Max_Estimate Sold_Price
1  1. Diego Rivera            Mujer con dalias                               40000        60000
2  2. Fernando Botero         Naturaleza muerta con frutas                   40000        50000
3  3. José Pancetti           Saquarema                                      40000        60000
4  4. Héctor Poleo            Maternidad                                    150000       200000
5  5. Emiliano di Cavalcanti  Natureza morta                                150000       200000
6  6. Matta                   Mourir pour le pain (Composition 50F)         250000       350000
7  7. Leonora Carrington      Santa Teresa en la cocina                     350000       450000       <NA>
8  <NA>                                                                    2000000      3000000
9  9. Carlos Mérida           Fecundidad                                     40000        60000
10 10. Francisco Toledo       Autorretrato                                   90000       120000
11 11. Armando Reverón        Desnudo detrás de la mantilla                 700000       900000
12 12. Fernando Botero        Donna Seduta                                  300000       350000

Close the connection when you are done:

remDr$close()
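If you started the server from R with wdman rather than the Terminal, stop that process too; the $stop function is visible in the wdman::selenium() output shown earlier:

# Only applies if you kept a handle, e.g. server <- wdman::selenium(port = 4567L)
server$stop()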

Some ethical recommendations:

  1. Avoid Overloading the Server. Use the Sys.sleep() function between requests (see the sketch after this list).
  2. Give Proper Attribution
  3. Test on Small Scales First
  4. If the Website/Platform has an API, use the API
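A sketch of the first recommendation, plus a robots.txt check via the robotstxt package (the package and the 2-5 second range are our suggestion, not part of the workshop code):

# install.packages("robotstxt")  # if needed
library(robotstxt)

# Ask whether scraping this path is permitted by the site's robots.txt
paths_allowed("https://www.sothebys.com/en/auctions/")

# Pause a random 2-5 seconds between requests so we don't overload the server
Sys.sleep(runif(1, min = 2, max = 5))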

Don’t get crazy about it: use the computer to do the boring stuff!

My question:

ChatGPT’s answer:

Some recommendations to use chatbots:

  1. Avoid ambiguity and ask for small tasks little by little… Divide and conquer!
  2. Trust, but also verify.

Source: Perkel (2023)

Scrape multiple pages with a for loop:

remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4567L, browserName = "chrome")

# Try to open the browser
remDr$open()

# Pages to process: the loop below iterates over this vector (add more auction URLs to scrape several sales)
paginas <- c("https://www.sothebys.com/en/auctions/2012/latin-american-art-n08862.html")

# Data frame columns (underscored so they match the names used when binding rows below)
columns <- c("Author", "Title", "Min_Estimate", "Max_Estimate", "Sold_Price", 
             "Auction_Title", "Auction_Date", "Sale_Total", "Sale_Number", 
             "Lots_Count", "Web", "Web_Number", "Code", "Year")
df_empty_total <- data.frame(matrix(ncol = length(columns), nrow = 0), stringsAsFactors = FALSE)
df_empty_total[] <- lapply(df_empty_total, as.character)

colnames(df_empty_total) <- columns

# Loop to process each page
for (web in paginas) {
  print(paste("Processing:", web))

  remDr$navigate(web)
  Sys.sleep(5)
  
  # Try to get the number of the last page; fall back to 1 if the paginator is missing
  last_page_number <- tryCatch({
    remDr$findElement(using = "xpath", '//a[@class="with-border "][last()]')$getElementText() %>% unlist() %>% as.integer()
  }, error = function(e) {
    print(paste("Error obtaining the number of pages:", e$message))
    1L
  })
  
  for (i in 1:last_page_number) {
    print(paste("Page:", i, "of", last_page_number))
    
    # Navigate to the corresponding results page
    remDr$navigate(paste0(web, "?p=", i))
    Sys.sleep(2)
    
    # Check for the "Log in to view sale total" message; if present, refresh the page
    tryCatch({
      sale_total_message <- remDr$findElement(using = "xpath", '//div[contains(text(), "Log in to view sale total")]')
      if (!is.null(sale_total_message)) {
        print("Message 'Log in to view sale total' found, refreshing the webpage...")
        remDr$refresh()
        Sys.sleep(5)
      }
    }, error = function(e) {
      print("Message 'Log in to view sale total' not found")
    })
    
    # Extract the artwork information
    artworks_info <- list()
    
    # Get the artwork elements on the page
    tryCatch({
      page_source <- remDr$getPageSource()[[1]]
      page_html <- read_html(page_source)
      artworks <- page_html %>% html_nodes(xpath = '//li[@class="AuctionsModule-results-item"]')
      
      author <- artworks %>% html_node(xpath = './/div[@class="title "]/a') %>% html_text(trim = TRUE)
      title <- artworks %>% html_node(xpath = './/div[@class="description"]') %>% html_text(trim = TRUE)
      estimate_text <- artworks %>% html_node(xpath = './/div[@class="estimate"]') %>% html_text(trim = TRUE)
      # Split every estimate string into its minimum and maximum values
      estimates <- str_remove_all(estimate_text, "Estimate: |,| USD") %>% str_split_fixed(" – ", 2)
      min_estimate <- ifelse(estimates[, 1] == "", NA, estimates[, 1])
      max_estimate <- ifelse(estimates[, 2] == "", NA, estimates[, 2])
      sold_price <- artworks %>% html_node(xpath = './/div[@class="sold"]') %>% html_text(trim = TRUE) %>% str_replace("Lot Sold: ", "")
      
      auction_title <- page_html %>% html_node(xpath = '//div[@class="AuctionsModule-auction-title"]') %>% html_text(trim = TRUE)
      auction_date <- page_html %>% html_node(xpath = '//div[@class="AuctionsModule-auction-info"]') %>% html_text(trim = TRUE)
      sale_total <- page_html %>% html_node(xpath = '//div[@class="AuctionsModule-auction-info-totalPrice"]') %>% html_text(trim = TRUE)
      sale_number <- page_html %>% html_node(xpath = '//div[@class="AuctionsModule-auction-info-saleNumber"]') %>% html_text(trim = TRUE)
      lots_count <- page_html %>% html_node(xpath = '//div[@class="AuctionsModule-lotsCount"]') %>% html_text(trim = TRUE)
      
      artworks_info <- data.frame(
        Author = author,
        Title = title,
        Min_Estimate = min_estimate,
        Max_Estimate = max_estimate,
        Sold_Price = sold_price,
        stringsAsFactors = FALSE
      )
      artworks_info$Auction_Title = auction_title
      artworks_info$Auction_Date = auction_date
      artworks_info$Sale_Total = sale_total
      artworks_info$Sale_Number = sale_number
      artworks_info$Lots_Count = lots_count
      artworks_info$Web = web
      artworks_info$Web_Number = i
    
    }, error = function(e) {
      print(paste("Error scraping data:", e$message))
    })
    
    df_empty_total <- bind_rows(df_empty_total, artworks_info)
    
    print(paste("Data extract from the page", i, " from", web))
  }
}

# Save the data frame to Excel (write.xlsx comes from the openxlsx package)
library(openxlsx)
write.xlsx(df_empty_total, "dataframe_auctions_LatinAmerica.xlsx")
remDr$close()

References

Perkel, J. M. 2023. “Six Tips for Better Coding with ChatGPT.” Nature (London) 618 (7964): 422–23.

Thank you! Questions?


guadag12@umd.edu

https://guadagonzalez.com/

@guadag12