
Scraping museum catalogues

DATE: 2021-07-05

AUTHOR: John L. Godlee

My partner is visiting some museums and art galleries in the eastern United States in the autumn, to look at Maya, Aztec and Mixtec artefacts that relate to slavery, captivity and forced labour. To find artefacts, she was looking through the online catalogue of each institution, and at the same time wanted to record metadata about the objects to refer back to later. Unfortunately, harvesting the metadata by hand was taking a long time, with all the copying, pasting and manual saving of images. I tried to help by writing a few scripts to scrape the object records online and save the metadata in an organised format.

Some of the institutions provide decent APIs to get artefact data, but others only provide web pages, so I had to use a mixture of different methods to scrape the information.

The institutions I scraped were Dumbarton Oaks, the Museum of Fine Arts Boston, the Nasher Museum of Art at Duke University, the Yale Peabody Museum, the Penn Museum, the Princeton University Art Museum, and the Smithsonian.

For each institution I was given a text file of links to object records. I used R to scrape the information, as that's what I know best. For the institutions without APIs, i.e. Dumbarton Oaks, the Museum of Fine Arts Boston, the Nasher, the Yale Peabody, and the Penn Museum, I used {rvest} to parse the downloaded HTML files. For example, for the Nasher:

# Packages
library(rvest)
library(dplyr)

# List record URLS
urls <- readLines("links.txt")

# Download each record page, naming the saved file after the object ID in the URL
lapply(urls, function(x) {
  download.file(x, destfile = file.path("html", 
      gsub("/.*", "", gsub("https://emuseum.nasher.duke.edu/objects/", "", x))))
})

# List html files
html_files <- list.files("html", full.names = TRUE)

# For each file
out_list <- lapply(html_files, function(x) {
  x <- read_html(x)

  # Get object title
  obj_title <- x %>%
    html_nodes("div.titleField") %>%
    html_nodes("h1") %>%
    html_text()

  # Get object metadata
  obj_labels <- x %>%
    html_nodes("span.detailFieldLabel") %>%
    html_text() %>%
    gsub(":.*", "", .)

  obj_values <- x %>%
    html_nodes("span.detailFieldValue") %>%
    html_text()

  # Create a one-row dataframe with the metadata labels as column names
  out <- as.data.frame(t(data.frame(obj_values)))
  names(out) <- obj_labels

  # Extract image IDs
  main_img_id <- x %>%
    html_nodes("div.emuseum-img-wrap") %>%
    html_nodes("img") %>%
    html_attr("src") %>%
    gsub("/internal/media/dispatcher/", "", .) %>%
    gsub("/.*", "", .) %>%
    unique()

  sec_img_id <- x %>% 
    html_nodes("div.secondarymedia-item") %>%
    html_nodes("a") %>%
    html_attr("data-media-id") %>%
    unique() 

  img_id <- unique(c(main_img_id, sec_img_id))

  # Construct image URLs
  img_url <- paste0(
    "https://emuseum.nasher.duke.edu/internal/media/dispatcher/", 
    img_id, 
    "/resize%3Aformat%3Dfull")

  # Create filenames
  img_filenames <- paste0(out$`Object number`, "_", img_id, ".jpg")

  # Download images
  if (length(img_url[!is.na(img_url)]) > 1) {
    download.file(img_url, destfile = file.path("img", img_filenames), 
      method = "libcurl")
  } else if (length(img_url[!is.na(img_url)]) == 1) { 
    download.file(img_url, destfile = file.path("img", img_filenames))
  }

  return(out)
})

# Write metadata to csv
out <- do.call(bind_rows, out_list)

write.csv(out, "all.csv", row.names = FALSE)

I think Princeton probably had the nicest and simplest API to use, while the Smithsonian had the most difficult. To be fair, the Smithsonian API is probably complex because it serves lots of institutions through the same interface, and covers a very diverse range of records.
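
For comparison, below is a minimal sketch of a keyword search against the Smithsonian Open Access API with {httr} and {jsonlite}. The endpoint and the api_key parameter come from the public Open Access documentation (keys are issued through api.data.gov), but the query parameters and the fields pulled out of each row are assumptions for illustration rather than a tested recipe, so they may need adjusting.

library(httr)
library(jsonlite)

# Smithsonian Open Access search endpoint; keys are issued via api.data.gov
si_base <- "https://api.si.edu/openaccess/api/v1.0/search"
api_key <- Sys.getenv("SMITHSONIAN_API_KEY")

# Search for records matching a keyword
resp <- GET(si_base, query = list(q = "Maya", rows = 10, api_key = api_key))
stop_for_status(resp)

# Parse the JSON body without simplification, to keep the nested structure
parsed <- fromJSON(content(resp, as = "text", encoding = "UTF-8"),
  simplifyVector = FALSE)

# Each hit sits in response$rows; richer metadata is nested under content
# (descriptiveNonRepeating, indexedStructured, freetext) and varies by unit
rows <- parsed$response$rows
titles <- vapply(rows, function(r) {
  if (is.null(r$title)) NA_character_ else r$title
}, character(1))

Because each Smithsonian unit fills those nested content fields differently, most of the effort goes into flattening them into a consistent table, which is presumably where the extra complexity comes from.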

To query the APIs I used {httr}, and to parse the JSON they returned I used {jsonlite}. Using the Princeton API as an example:

library(httr)
library(jsonlite)
library(dplyr)

base <- "https://data.artmuseum.princeton.edu/objects/"

# Import links
links <- readLines("links.txt")

# Get IDs
ids <- gsub(".*/", "", links)

# For each ID, get record
out_list <- lapply(ids, function(x) {
  message(x)
  # Get record
  resp <- GET(paste0(base, x))

  # Parse JSON
  resp_parsed <- content(resp, as = "parsed")

  # Save JSON 
  write(content(resp, as = "text"), file.path("json", paste0(x, ".json")))

  # Helper: replace NULL fields with "NA" so every dataframe column is filled
  ifnull <- function(x) {
    if (is.null(x)) {
      return("NA")
    } else {
      return(x)
    }
  }

  # Extract description
  desc_df <- data.frame(
    displayperiod = ifnull(resp_parsed$displayperiod),
    displayculture = ifnull(resp_parsed$displayculture),
    classification = ifnull(resp_parsed$classification),
    daterange = ifnull(resp_parsed$daterange),
    description = ifnull(paste(lapply(resp_parsed$texts, function(x) {
      x$textentryhtml
    }), collapse = "; ")),
    accessionyear = ifnull(resp_parsed$accessionyear),
    title = ifnull(resp_parsed$titles[[1]]$title),
    catalograisonne = ifnull(resp_parsed$catalograisonne),
    objectnumber = ifnull(resp_parsed$objectnumber),
    objectid = ifnull(resp_parsed$objectid),
    department = ifnull(resp_parsed$department),
    country = ifnull(resp_parsed$geography[[1]]$country),
    locale = ifnull(resp_parsed$geography[[1]]$locale),
    region = ifnull(resp_parsed$geography[[1]]$region),
    subcontinent = ifnull(resp_parsed$geography[[1]]$subcontinent),
    locus = ifnull(resp_parsed$geography[[1]]$locus),
    county = ifnull(resp_parsed$geography[[1]]$county),
    excavation = ifnull(resp_parsed$geography[[1]]$excavation),
    state = ifnull(resp_parsed$geography[[1]]$state),
    latitude = ifnull(resp_parsed$geography[[1]]$location$lat),
    longitude = ifnull(resp_parsed$geography[[1]]$location$lon),
    river = ifnull(resp_parsed$geography[[1]]$location$river),
    continent = ifnull(resp_parsed$geography[[1]]$continent),
    medium = ifnull(resp_parsed$medium),
    dimensions = ifnull(paste(lapply(resp_parsed$dimensionelements, function(x) {
      paste(x$type, x$dimension, x$element, x$units, sep = ":")
    }), collapse = "; "))
    )

  # Collect the URI and ID of each image attached to the record
  img_list <- lapply(resp_parsed$media, function(x) {
    c(x$uri, x$id)
  })

  img_filenames <- paste0(x, "_", lapply(img_list, "[", 2),  ".jpg")

  # Image URIs follow the IIIF Image API pattern, so append the full-size JPEG path
  img_urls <- paste0(lapply(img_list, "[", 1), "/full/full/0/default.jpg")

  if (length(img_list[!is.na(img_list)]) > 1) {
    try(download.file(img_urls, destfile = file.path("img", img_filenames), 
      method = "libcurl"))
  } else if (length(img_list[!is.na(img_list)]) == 1) { 
    try(download.file(img_urls, destfile = file.path("img", img_filenames)))
  }

  return(desc_df)
})

# Write metadata to csv
out <- do.call(bind_rows, out_list)

write.csv(out, "all.csv", row.names = FALSE)